First Steps in Writing a Blog Crawler in Python

Tutorial guide

urllib2.HTTPError: HTTP Error 403: Forbidden

Fix: add a header so the request masquerades as a browser visit.
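A minimal sketch of the fix with urllib2 (the URL here is just a placeholder):

import urllib2

request = urllib2.Request('http://blog.csdn.net/')  # placeholder URL
# pretend to be an ordinary browser; without this the server answers 403
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')
response = urllib2.urlopen(request)
print response.getcode()  # 200 once the header is accepted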

UnicodeEncodeError: 'gbk' codec can't encode character u'\u200e' in position 43: illegal multibyte sequence

Fix: drop the insignificant invisible characters during the encoding conversion, as in the sketch below.
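A minimal sketch of the idea, passing 'ignore' as the errors argument of encode() so that characters the target charset cannot represent are silently dropped:

# u'\u200e' (a left-to-right mark) has no GBK representation, so a plain
# text.encode('gbk') would raise the UnicodeEncodeError shown above;
# the 'ignore' error handler simply drops the character
text = u'hello\u200eworld'
print text.encode('gbk', 'ignore')  # -> helloworld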

import urllib
import urllib2
import re
import os

# create a directory if it does not exist yet
def mkdir(path):
    path = path.strip()

    isExists=os.path.exists(path)

    if not isExists:
        os.makedirs(path)
        return True
    else:
        return False

# write content to dir/name in the given charset; the 'ignore' error handler
# drops any character the target charset cannot encode (see the
# UnicodeEncodeError above)
def output_file(dir,name,content,charset):
    fileName = dir + "/" + name  #.encode('utf-8','ignore')
    f = open(fileName,"w+")
    f.write(content.encode(charset,'ignore'))
    f.close()
    print "Output file",fileName

# fetch one article page and save its title and body under dir
def scan_article(host,link,dir,charset):
    url=host+link

    request = urllib2.Request(url)
    # disguising web browser headers
    request.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36')

    response=urllib2.urlopen(request)

    html= response.read().decode('utf-8','ignore')

    # print html

    # NOTE: the HTML tags inside this and the later patterns were stripped
    # by the blog renderer; they are reconstructed here as plausible guesses
    # for the old CSDN blog markup
    pattern=re.compile(r'<span class="link_title"><a href="[^"]*">\s*([\s\S]*?)\s*</a></span>')

    matches=re.findall(pattern,html)

    if matches:
        title=matches[0]
        # filename=re.sub("\s+","_",title)
        # replace whitespace and characters illegal in file names with "_"
        filename=re.sub(r'[\s\\/:*?"<>|]+',"_",title)
        #print title,"[",filename,"]"
    else:
        print "No title matches"
        return

    # article body; tags reconstructed (see the NOTE above)
    pattern=re.compile(r'<div id="article_content" class="article_content">\s*([\s\S]*?)\s*</div>')

    matches=re.findall(pattern,html)

    if matches:
        html=matches[0]
        # print html
    else:
        print "No contents"
        return

    # print "Output file",filename+'.html'
    try:
        output_file(dir,filename+'.html',html,charset)
    except Exception as e:
        print str(e)
        return

# fetch one page of the article list and scan every article linked on it
def scan_page(id,host,url,dir,charset):
    request = urllib2.Request(host+url+str(id))
    # disguising web browser headers
    request.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36')

    response=urllib2.urlopen(request)

    html= response.read().decode('utf-8','ignore')
    # print html

    # one <span class="link_title">...</span> per article on the list page
    pattern=re.compile(r'<span class="link_title">([\s\S]*?)</span>',re.I|re.M)
    items=re.findall(pattern,html)

    if items:
        # print items
        for item in items:
            # group(1) is the article link, group(2) the article title
            next=re.match(re.compile(r'<a href="([^"]*)">\s*([\s\S]+\S)\s*</a>'),item)
            if next:
                href=next.group(1)
                title=next.group(2)
                scan_article(host,href,dir,charset)
                # print href,"->",title,"[",filename,"]"
            else:
                print "Invalid item"
                return
    else:
        print "No title matches"
        return

dir='data/csdn_utf-8'
host="http://blog.csdn.net"
url="/u013491262/article/list/"
charset='utf-8'

mkdir(dir)
# scan_article(host,"/u013491262/article/details/20783371",dir,'utf-8')
for i in range(28,31):
    print "page ",str(i),":"
    dir='data/csdn_utf-8'+"/"+str(i).zfill(2)
    mkdir(dir)
    scan_page(i,host,url,dir,charset)
Author: Semprathlon / Simfae Dean
Posted on: 07/12/2016
Updated on: 07/19/2023
