First Steps in Writing a Blog Crawler in Python

Tutorial guide

urllib2.HTTPError: HTTP Error 403: Forbidden

Fix: add a header so the request masquerades as a browser visit.
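A minimal sketch of the fix with urllib2 (the URL here is just a placeholder):

import urllib2

request = urllib2.Request('http://blog.csdn.net/')  # placeholder URL
# pretend to be an ordinary browser; without this the server answers 403
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
response = urllib2.urlopen(request)
print response.getcode()  # 200 once the header is accepted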

UnicodeEncodeError: 'gbk' codec can't encode character u'\u200e' in position 43: illegal multibyte sequence

Fix: drop the insignificant invisible characters during the encoding conversion, as in the sketch below.
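A minimal sketch of the idea, passing 'ignore' as the errors argument of encode() so that characters the target charset cannot represent are silently dropped:

# u'\u200e' (a left-to-right mark) has no GBK representation, so a plain
# text.encode('gbk') would raise the UnicodeEncodeError shown above;
# the 'ignore' error handler simply drops the character
text = u'hello\u200eworld'
print text.encode('gbk', 'ignore')  # -> helloworld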

import urllib
import urllib2
import re
import os

# create a directory if it does not exist yet
def mkdir(path):
    path = path.strip()

    isExists=os.path.exists(path)

    if not isExists:
        os.makedirs(path)
        return True
    else:
        return False

# write content to dir/name in the given charset; the 'ignore' error handler
# drops any character the target charset cannot encode (see the
# UnicodeEncodeError above)
def output_file(dir,name,content,charset):
    fileName = dir + "/" + name  #.encode('utf-8','ignore')
    f = open(fileName,"w+")
    f.write(content.encode(charset,'ignore'))
    f.close()
    print "Output file",fileName

# fetch one article page and save its title and body under dir
def scan_article(host,link,dir,charset):
    url=host+link

    request = urllib2.Request(url)
    # disguising web browser headers
    request.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36')

    response=urllib2.urlopen(request)

    html= response.read().decode('utf-8','ignore')

    # print html

    # NOTE: the HTML tags inside this and the later patterns were stripped
    # by the blog renderer; they are reconstructed here as plausible guesses
    # for the old CSDN blog markup
    pattern=re.compile(r'<span class="link_title"><a href="[^"]*">\s*([\s\S]*?)\s*</a></span>')

    matches=re.findall(pattern,html)

    if matches:
        title=matches[0]
        # filename=re.sub("\s+","_",title)
        # replace whitespace and characters illegal in file names with "_"
        filename=re.sub(r'[\s\\/:*?"<>|]+',"_",title)
        #print title,"[",filename,"]"
    else:
        print "No title matches"
        return

    # article body; tags reconstructed (see the NOTE above)
    pattern=re.compile(r'<div id="article_content" class="article_content">\s*([\s\S]*?)\s*</div>')

    matches=re.findall(pattern,html)

    if matches:
        html=matches[0]
        # print html
    else:
        print "No contents"
        return

    # print "Output file",filename+'.html'
    try:
        output_file(dir,filename+'.html',html,charset)
    except Exception as e:
        print str(e)
        return

# fetch one page of the article list and scan every article linked on it
def scan_page(id,host,url,dir,charset):
    request = urllib2.Request(host+url+str(id))
    # disguising web browser headers
    request.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36')

    response=urllib2.urlopen(request)

    html= response.read().decode('utf-8','ignore')
    # print html

    # one <span class="link_title">...</span> per article on the list page
    pattern=re.compile(r'<span class="link_title">([\s\S]*?)</span>',re.I|re.M)
    items=re.findall(pattern,html)

    if items:
        # print items
        for item in items:
            # group(1) is the article link, group(2) the article title
            next=re.match(re.compile(r'<a href="([^"]*)">\s*([\s\S]+\S)\s*</a>'),item)
            if next:
                href=next.group(1)
                title=next.group(2)
                scan_article(host,href,dir,charset)
                # print href,"->",title,"[",filename,"]"
            else:
                print "Invalid item"
                return
    else:
        print "No title matches"
        return

dir='data/csdn_utf-8'
host="http://blog.csdn.net"
url="/u013491262/article/list/"
charset='utf-8'

mkdir(dir)
# scan_article(host,"/u013491262/article/details/20783371",dir,'utf-8')
for i in range(28,31):
    print "page ",str(i),":"
    dir='data/csdn_utf-8'+"/"+str(i).zfill(2)
    mkdir(dir)
    scan_page(i,host,url,dir,charset)
Author: Semprathlon / Simfae Dean
Posted on: 07/12/2016
Updated on: 07/19/2023
