py编写blog爬虫初步
教程指导
urllib2.HTTPError: HTTP Error 403: Forbidden
需添加header以伪装成浏览器访问。
UnicodeEncodeError: 'gbk' codec can't encode character u'\u200e' in position 43: illegal multibyte sequence
应在编码转换时略去无关紧要的不可见字符（即调用 encode(charset, 'ignore')）。
import urllib
import urllib2
import re
import os
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Leading/trailing whitespace in *path* is stripped first.
    Returns True when the directory was created, False when it
    already existed.  Other failures (permissions, a file occupying
    the path, ...) propagate as OSError.
    """
    path = path.strip()
    # EAFP instead of the original exists()/makedirs() pair, which
    # was racy: another process could create the directory between
    # the check and the call.
    try:
        os.makedirs(path)
        return True
    except OSError:
        # Only an already-existing directory maps to False; anything
        # else is a genuine error and is re-raised.
        if os.path.isdir(path):
            return False
        raise
def output_file(dir, name, content, charset):
    """Encode *content* with *charset* and write it to dir/name.

    Characters that cannot be represented in *charset* are silently
    dropped ('ignore') -- this sidesteps UnicodeEncodeError on
    invisible marks such as u'\\u200e' (see the gbk note at the top
    of the file).  Prints the written file name as a progress line.
    """
    fileName = dir + "/" + name
    # The original opened the file and never closed it (handle leak)
    # and wrote already-encoded bytes to a text-mode handle.  Binary
    # mode plus a context manager fixes both and works the same on
    # Python 2 and 3.
    with open(fileName, "wb") as f:
        f.write(content.encode(charset, 'ignore'))
    print("Output file " + fileName)
def scan_article(host, link, dir, charset):
    """Fetch the article page at host+link, extract its title and body,
    and save the body as <sanitized-title>.html under *dir*, encoded
    as *charset*.

    Prints a diagnostic and returns early when no title or no content
    can be extracted from the page.
    """
    url = host + link
    request = urllib2.Request(url)
    # Spoof a desktop-browser User-Agent: without it the server
    # answers HTTP 403 Forbidden (see the note at the top of the file).
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36')
    response = urllib2.urlopen(request)
    # 'ignore' drops undecodable byte sequences instead of raising.
    html = response.read().decode('utf-8', 'ignore')
    # print html
    # NOTE(review): this pattern only captures a span surrounded by
    # whitespace -- the HTML tag anchors appear to have been lost when
    # the file was pasted; confirm against the original script.
    pattern = re.compile(r'\s*([\s\S]*?)\s*')
    matches = re.findall(pattern, html)
    if matches:
        title = matches[0]
        # Replace characters that are illegal in file names with '_'.
        # filename=re.sub("\s+","_",title)
        filename = re.sub(r'[\s\\\\\\/:\\*\\?"<>\\|]+', "_", title)
        # print title,"[",filename,"]"
    else:
        print "No title matches"
        return
    # NOTE(review): same concern -- the content pattern also looks
    # garbled (no tag text between the whitespace anchors); verify.
    pattern = re.compile(r'\s*([\s\S]*?)\s*\s*')
    matches = re.findall(pattern, html)
    if matches:
        html = matches[0]
        # print html
    else:
        print "No contents"
        return
    # print "Output file",filename+'.html'
    try:
        output_file(dir, filename + '.html', html, charset);
    except Exception as e:
        # Best-effort: report the failure but keep the crawl going.
        print str(e)
        return
def scan_page(id, host, url, dir, charset):
    """Fetch article-list page number *id* at host+url+id and hand each
    linked article to scan_article, saving results under *dir*.

    Prints a diagnostic and returns early when the page yields no
    items or an item cannot be parsed.
    """
    request = urllib2.Request(host + url + str(id))
    # Spoof a browser User-Agent to avoid HTTP 403 (see file header note).
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36')
    response = urllib2.urlopen(request)
    # 'ignore' drops undecodable byte sequences instead of raising.
    html = response.read().decode('utf-8', 'ignore')
    # print html;
    # NOTE(review): pattern looks garbled -- the HTML tag delimiters
    # seem to have been stripped in transit; confirm against the
    # original script.
    pattern = re.compile(r'([\s\S]*?)', re.I | re.M)
    items = re.findall(pattern, html)
    if items:
        # print items
        for item in items:
            next = re.match(re.compile(r'\s*([\s\S]+\S)\s*'), item)
            if next:
                href = next.group(1)
                # NOTE(review): the pattern as shown defines only ONE
                # capture group, so group(2) would raise IndexError --
                # presumably a second (title) group was lost together
                # with the stripped HTML tags; verify.
                title = next.group(2)
                scan_article(host, href, dir, charset)
                # print href,"->",title,"[",filename,"]"
            else:
                print "Invalid item"
                return
    else:
        print "No title matches"
        return
# --- crawler configuration and entry point ---
dir = 'data/csdn_utf-8';            # output root directory
host = "http://blog.csdn.net"       # blog host to crawl
url = "/u013491262/article/list/"   # list path; page number is appended
charset = 'utf-8'                   # encoding used for the saved files
mkdir(dir)
# scan_article(host,"/u013491262/article/details/20783371",dir,'utf-8')
# Crawl list pages 28..30 inclusive, writing each page's articles into
# its own zero-padded subdirectory (data/csdn_utf-8/28, /29, /30).
for i in range(28, 31):
    print "page ", str(i), ":"
    dir = 'data/csdn_utf-8' + "/" + str(i).zfill(2)
    mkdir(dir)
    scan_page(i, host, url, dir, charset)
py编写blog爬虫初步