티스토리 워드프레스 블로그 이전 파이썬 스크립트 오류 수정판2
2025.05.29 - [컴퓨터 인터넷 모바일 it/블로그 애드센스 등] - 티스토리 블로그 워드프레스 이전 파이썬 스크립트
2025.06.07 - [컴퓨터 인터넷 모바일 it/블로그 애드센스 등] - 티스토리 워드프레스 블로그 이전 파이썬 스크립트 오류 수정판
이전 버전에서 다음과 같은 오류가 발생했습니다.
[357/721] 크롤링: https://tistory.hanuhyunu.pw/entry/%EC%9E%90%EB%8F%99%EC%B0%A8-%EA%B2%BD%EA%B3%A0%EB%93%B1-%EC%A2%85%EB%A5%98-%EC%A0%95%EB%A6%AC
python : Traceback (most recent call last):
위치 줄:1 문자:1
+ python tistory_to_wp.py --sitemap sitemap.xml --output wordpress_expo ...
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ CategoryInfo : NotSpecified: (Traceback (most recent call last)::String) [], RemoteException
+ FullyQualifiedErrorId : NativeCommandError
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\connection.py", line 198, in _new_conn
sock = connection.create_connection(
(self._dns_host, self.port),
...<2 lines>...
socket_options=self.socket_options,
)
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\util\connection.py", line 60, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\socket.py", line 977, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
socket.gaierror: [Errno 11002] getaddrinfo failed
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\connectionpool.py", line 787, in urlopen
response = self._make_request(
conn,
...<10 lines>...
**response_kw,
)
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\connectionpool.py", line 488, in _make_request
raise new_e
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\connectionpool.py", line 464, in _make_request
self._validate_conn(conn)
~~~~~~~~~~~~~~~~~~~^^^^^^
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\connectionpool.py", line 1093, in _validate_conn
conn.connect()
~~~~~~~~~~~~^^
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\connection.py", line 704, in connect
self.sock = sock = self._new_conn()
~~~~~~~~~~~~~~^^
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\connection.py", line 205, in _new_conn
raise NameResolutionError(self.host, self, e) from e
urllib3.exceptions.NameResolutionError: <urllib3.connection.HTTPSConnection object at 0x00000186914E39D0>: Failed to resolve 'tistory.hanuhyunu.pw' ([Errno 11002] get
addrinfo failed)
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\requests\adapters.py", line 667, in send
resp = conn.urlopen(
method=request.method,
...<9 lines>...
chunked=chunked,
)
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\connectionpool.py", line 841, in urlopen
retries = retries.increment(
method, url, error=new_e, _pool=self, _stacktrace=sys.exc_info()[2]
)
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\urllib3\util\retry.py", line 519, in increment
raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='tistory.hanuhyunu.pw', port=443): Max retries exceeded with url: /entry/%EC%9E%90%EB%8F%99%EC%B0%A8-%EA%B2
%BD%EA%B3%A0%EB%93%B1-%EC%A2%85%EB%A5%98-%EC%A0%95%EB%A6%AC (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000186914E39D0>: Failed
to resolve 'tistory.hanuhyunu.pw' ([Errno 11002] getaddrinfo failed)"))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\WorkPY\BlogExport\tistory_to_wp.py", line 203, in main
posts.append(fetch_post(url))
~~~~~~~~~~^^^^^
File "D:\WorkPY\BlogExport\tistory_to_wp.py", line 45, in fetch_post
resp = requests.get(url, headers=headers, timeout=10)
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\requests\api.py", line 73, in get
return request("get", url, params=params, **kwargs)
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\requests\api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\requests\sessions.py", line 589, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\requests\sessions.py", line 703, in send
r = adapter.send(request, **kwargs)
File "C:\Users\hidec\AppData\Local\Programs\Python\Python313\Lib\site-packages\requests\adapters.py", line 700, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='tistory.hanuhyunu.pw', port=443): Max retries exceeded with url: /entry/%EC%9E%90%EB%8F%99%EC%B0%A8-%EA
%B2%BD%EA%B3%A0%EB%93%B1-%EC%A2%85%EB%A5%98-%EC%A0%95%EB%A6%AC (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000186914E39D0>: Fail
ed to resolve 'tistory.hanuhyunu.pw' ([Errno 11002] getaddrinfo failed)"))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\WorkPY\BlogExport\tistory_to_wp.py", line 216, in <module>
main()
~~~~^^
File "D:\WorkPY\BlogExport\tistory_to_wp.py", line 205, in main
print(f" \u26a0\ufe0f 실패: {e}")
~~~~~^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'cp949' codec can't encode character '\u26a0' in position 2: illegal multibyte sequence
아래는 CP949 콘솔 환경에서도 멈추지 않도록, 모든 화살표(→)와 이모지(⚠️, 📥 등)를 제거하고 순수 ASCII 문자 또는 한글만 출력하도록 바꾼 버전입니다. 이제 print에서 UnicodeEncodeError가 발생하지 않습니다.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from lxml import etree
import re
import urllib.parse
from xml.etree import ElementTree as ET
def parse_sitemap(path):
    """Parse a sitemap XML file and return the list of Tistory post URLs.

    Only URLs containing '/entry/' (individual post pages) are kept;
    category/page URLs are dropped.
    """
    tree = ET.parse(path)
    root = tree.getroot()
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    # Guard: an empty <loc> element yields el.text == None, which would raise
    # TypeError on the 'in' membership test in the original version.
    return [
        el.text
        for el in root.findall('.//ns:loc', ns)
        if el.text and '/entry/' in el.text
    ]
def clean_entry_content(html):
    """Strip ads, scripts, TOC boxes and Tistory widgets from post HTML.

    Returns the cleaned inner HTML as a string, ready to embed in a WXR
    content:encoded element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Remove the auto-generated table of contents box.
    for toc in soup.select('div.book-toc'):
        toc.decompose()
    # Scripts, ad slots (<ins>) and embedded frames never belong in the export.
    for tag in soup.find_all(['script', 'ins', 'iframe']):
        tag.decompose()
    # AdSense publisher ids look like "ca-pub-1234567890".
    pattern = re.compile(r'ca-pub-\d+')
    for tag in soup.find_all(True):
        try:
            if tag.string and pattern.search(tag.string):
                tag.decompose()
                continue
            attrs = tag.attrs or {}
            if any(pattern.search(str(v)) for v in attrs.values()):
                tag.decompose()
                continue
            style = attrs.get('style', '')
            if any(x in style for x in ['adsbygoogle', 'overflow:hidden']):
                tag.decompose()
                continue
            # Tistory React widgets (Naver ads, reaction buttons).
            if attrs.get('data-tistory-react-app') in ['NaverAd', 'Reaction']:
                tag.decompose()
                continue
            if attrs.get('id') == 'adsense_script':
                tag.decompose()
                continue
        except Exception:
            # A tag may already be gone because its parent was decomposed
            # earlier in this loop; best-effort cleanup — skip and continue.
            # (Was a bare "except:", which also swallowed KeyboardInterrupt.)
            pass
    for btn in soup.find_all('button'):
        btn.decompose()
    return soup.decode_contents()
def fetch_post(url):
    """Download one Tistory post and extract its data for WXR export.

    Returns a tuple (title, pubDate, pub, cleaned_html, url, guid_url).
    Raises requests.HTTPError / ConnectionError on network failure.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Single lookup instead of calling soup.find('title') twice.
    title_tag = soup.find('title')
    title = title_tag.get_text().strip() if title_tag else url
    date_str = ''
    if span := soup.find('span', class_='date'):
        date_str = span.get_text().strip()
    elif time_tag := soup.find('time'):
        date_str = time_tag.get('datetime', '')
    try:
        # Tistory renders dates like "2025. 5. 29." — otherwise try ISO 8601.
        m = re.match(r"(\d{4})\.\s*(\d{1,2})\.\s*(\d{1,2})\.", date_str)
        pub = datetime(*map(int, m.groups())) if m else datetime.fromisoformat(date_str)
    except (ValueError, TypeError):
        # Unparseable or missing date: fall back to "now" so the export
        # still succeeds. (Was a bare "except:".)
        pub = datetime.now()
    # RSS pubDate must be in GMT; naive datetimes are passed through as-is.
    rss_pub = pub.astimezone(timezone.utc) if pub.tzinfo else pub
    pubDate = rss_pub.strftime('%a, %d %b %Y %H:%M:%S +0000')
    # Prefer the dedicated content container; fall back to broader tags.
    cont = (
        soup.find('div', class_='entry-content')
        or soup.find('article')
        or soup.find('section')
    )
    raw_html = cont.decode_contents() if cont else ''
    cleaned = clean_entry_content(raw_html)
    # dg:plink meta carries the canonical permalink; fall back to the URL.
    guid_url = (soup.find("meta", property="dg:plink") or {}).get("content", url)
    return title, pubDate, pub, cleaned, url, guid_url
def sanitize_xml(text: str) -> str:
    """Remove control characters that are illegal in XML 1.0 documents.

    Below 0x20, XML permits only TAB (0x09), LF (0x0A) and CR (0x0D);
    everything else in that range is stripped.
    """
    illegal_chars = re.compile('[\x00-\x08\x0B-\x0C\x0E-\x1F]')
    return illegal_chars.sub('', text)
def build_wxr(posts, out_path, base_url, start_id=0):
    """Write one WXR (WordPress eXtended RSS) file containing *posts*.

    posts    -- list of (title, pubDate, pub, content, link, guid_url) tuples
    out_path -- output file path
    base_url -- blog root URL used for the channel metadata
    start_id -- wp:post_id offset so ids stay unique across chunked files
    """
    print(f"[빌드 시작] {out_path} (포스트 {start_id+1}~{start_id+len(posts)})")
    NSMAP = {
        'excerpt': "http://wordpress.org/export/1.2/excerpt/",
        'content': "http://purl.org/rss/1.0/modules/content/",
        'dc': "http://purl.org/dc/elements/1.1/",
        'wp': "http://wordpress.org/export/1.2/"
    }
    wp_ns = NSMAP['wp']
    root = etree.Element('rss', nsmap=NSMAP)
    root.set('version', '2.0')
    channel = etree.SubElement(root, 'channel')

    def add(parent, tag, text):
        # Create a child element, wrapping text in CDATA where WP expects it.
        el = etree.SubElement(parent, tag)
        if tag == 'title':
            el.text = etree.CDATA(text)
        elif tag == '{http://purl.org/dc/elements/1.1/}creator':
            el.text = etree.CDATA(text)
        elif tag.startswith(f'{{{wp_ns}}}'):
            # BUGFIX: replaced the fragile "cond and a or b" idiom with an
            # explicit conditional — numeric wp:* values stay plain text,
            # everything else is CDATA-wrapped.
            el.text = text if text.isdigit() else etree.CDATA(text)
        else:
            el.text = text
        return el

    # Channel metadata.
    add(channel, 'title', 'Tistory 백업')
    add(channel, 'link', base_url)
    add(channel, 'description', '')
    add(channel, 'language', 'ko-KR')
    add(channel, '{http://wordpress.org/export/1.2/}wxr_version', '1.2')
    add(channel, '{http://wordpress.org/export/1.2/}base_site_url', base_url)
    add(channel, '{http://wordpress.org/export/1.2/}base_blog_url', base_url)
    # Author record every item is attributed to.
    auth = etree.SubElement(channel, '{http://wordpress.org/export/1.2/}author')
    add(auth, '{http://wordpress.org/export/1.2/}author_id', '1')
    add(auth, '{http://wordpress.org/export/1.2/}author_login', 'admin')
    add(auth, '{http://wordpress.org/export/1.2/}author_email', 'admin@blog.com')
    add(auth, '{http://wordpress.org/export/1.2/}author_display_name', 'admin')
    # One <item> per post.
    for idx, (title, pubDate, pub, content, link, guid_url) in enumerate(posts, start=start_id+1):
        print(f" - 아이템 #{idx}")
        item = etree.SubElement(channel, 'item')
        add(item, 'title', title)
        add(item, 'link', link)
        add(item, 'pubDate', pubDate)
        add(item, '{http://purl.org/dc/elements/1.1/}creator', 'admin')
        guid_el = etree.SubElement(item, 'guid', isPermaLink="false")
        guid_el.text = guid_url
        etree.SubElement(item, 'description')
        ce = etree.SubElement(item, '{http://purl.org/rss/1.0/modules/content/}encoded')
        # Minify: drop newlines/tabs and collapse inter-tag whitespace so the
        # CDATA payload stays compact.
        minified = re.sub(r'>\s+<', '><',
                          content.replace('\n', '').replace('\r', '').replace('\t', ''))
        ce.text = etree.CDATA(sanitize_xml(minified))
        dt_wp = pub.strftime('%Y-%m-%d %H:%M:%S')
        dt_gmt = pub.astimezone(timezone.utc).strftime('%Y-%m-%d %H:%M:%S') \
            if pub.tzinfo else dt_wp
        add(item, '{http://wordpress.org/export/1.2/}post_id', str(idx))
        add(item, '{http://wordpress.org/export/1.2/}post_date', dt_wp)
        add(item, '{http://wordpress.org/export/1.2/}post_date_gmt', dt_gmt)
        # Remaining wp:* metadata WordPress expects on every post.
        add(item, '{http://wordpress.org/export/1.2/}comment_status', 'open')
        add(item, '{http://wordpress.org/export/1.2/}ping_status', 'open')
        slug = link.rstrip('/').split('/')[-1]
        add(item, '{http://wordpress.org/export/1.2/}post_name', slug)
        add(item, '{http://wordpress.org/export/1.2/}status', 'publish')
        add(item, '{http://wordpress.org/export/1.2/}post_parent', '0')
        add(item, '{http://wordpress.org/export/1.2/}menu_order', '0')
        add(item, '{http://wordpress.org/export/1.2/}post_type', 'post')
        add(item, '{http://wordpress.org/export/1.2/}post_password', '')
        add(item, '{http://wordpress.org/export/1.2/}is_sticky', '0')
        cat = etree.Element('category', domain="category", nicename="uncategorized")
        cat.text = "Uncategorized"
        item.append(cat)
        meta = etree.SubElement(item, '{http://wordpress.org/export/1.2/}postmeta')
        add(meta, '{http://wordpress.org/export/1.2/}meta_key', '_edit_last')
        add(meta, '{http://wordpress.org/export/1.2/}meta_value', '1')
    tree = etree.ElementTree(root)
    tree.write(out_path, encoding='utf-8',
               pretty_print=True, xml_declaration=True)
    # Strip Tistory boilerplate comments left inside the CDATA content.
    remove_xml_comments(out_path)
    print(f"[완료] {out_path}\n")
def remove_xml_comments(path):
    """Strip known Tistory boilerplate HTML comments from the exported file.

    Done textually after the XML is written because the comments live inside
    CDATA content, where an XML parser would not see them as comments.
    """
    # BUGFIX: the original used open(path).read() without closing the handle
    # and re-imported `re` locally; use context managers and the module-level
    # import instead.
    with open(path, 'r', encoding='utf-8') as f:
        txt = f.read()
    for pat in [
        r'<!--\s*System\s*-\s*START\s*-->',
        r'<!--\s*System\s*-\s*END\s*-->',
        r'<!--\s*inventory\s*-->',
        r'<!--\s*PostListinCategory\s*-\s*START\s*-->',
        r'<!--\s*PostListinCategory\s*-\s*END\s*-->'
    ]:
        txt = re.sub(pat, '', txt, flags=re.IGNORECASE)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(txt)
def main():
    """CLI entry point: crawl posts from a sitemap and emit chunked WXR files."""
    parser = argparse.ArgumentParser(description='Tistory → WordPress 변환기')
    parser.add_argument('--sitemap', default='sitemap.xml', help='Sitemap XML 경로')
    parser.add_argument('--output', default='wordpress_export_final.xml', help='출력 WXR 파일명')
    args = parser.parse_args()

    urls = parse_sitemap(args.sitemap)
    # BUGFIX: urls[0] raised IndexError when the sitemap contained no
    # /entry/ URLs; bail out with a clear message instead.
    if not urls:
        print("사이트맵에서 /entry/ URL을 찾지 못했습니다.")
        return
    # Derive the blog root (scheme + host) from the first post URL.
    base = urllib.parse.urlparse(urls[0])
    base_url = f"{base.scheme}://{base.netloc}"

    posts = []
    total = len(urls)
    for i, url in enumerate(urls, start=1):
        print(f"[{i}/{total}] 크롤링: {url}")
        try:
            posts.append(fetch_post(url))
        except Exception as e:
            # ASCII/Korean-only message avoids UnicodeEncodeError on cp949 consoles.
            print(f" 경고: {e}")

    # Emit 100 posts per WXR file so WordPress imports stay manageable.
    prefix, ext = args.output.rsplit('.', 1)
    for chunk_idx in range(0, len(posts), 100):
        chunk = posts[chunk_idx:chunk_idx+100]
        part = chunk_idx // 100 + 1
        out_file = f"{prefix}-{part:03d}.{ext}"
        build_wxr(chunk, out_file, base_url, start_id=chunk_idx)


if __name__ == '__main__':
    main()
변경 요약
- print에서 화살표(→)와 이모지(⚠️, 📥)를 모두 제거하고 ASCII/한글만 사용했습니다.
- 예외 발생 시 print(f" 경고: {e}")로 대체했습니다.
- 내부 sanitize_xml()로 CDATA에 넣기 전 제어문자를 제거해 멈추지 않게 했습니다.
'컴퓨터 인터넷 모바일 it > 블로그 애드센스 등' 카테고리의 다른 글
워드프레스 FIFU 플러그인 CDN 문제 - 티스토리 이미지 링크 업데이트로 생긴 이미지 깨짐 문제 완벽 해결 가이드 (0) | 2025.07.06 |
---|---|
chatGPT 티스토리 블로그 마크다운 옮길 때 소소한 수정 팁 (0) | 2025.07.04 |
티스토리 워드프레스 블로그 이전 파이썬 스크립트 오류 수정판 (0) | 2025.06.07 |
티스토리 블로그 애드센스 무효트래픽 방지코드, 티스토리 링크 버튼 로그인제한 30일 징계 (0) | 2025.06.07 |
IndexNow 빙 자동 색인 요청하기 - 워드프레스 CrawlWP 플러그인 (0) | 2025.06.02 |
댓글