Python 爬虫框架

学Python的应该都写过爬虫吧,如果希望提高爬虫的效率就要用到并发,可以选择使用多线程、多进程,还有最近很火的Gevent,据说是基于一种新的概念——协程,不管什么程,总之好用就行了。写一个爬虫有一系列的东西需要处理,如果有一个好用的框架就会事半功倍。
在42qu.com源码里面,有个教主写的爬虫框架,很好用,是基于Gevent的,处理url采用和web开发类似的映射方法,教主还写了一个简单的html处理库extract,虽没有BeautifulSoup那样强大,但是简单好用,基本能满足常见的要求。HTTP请求使用了requests,requests是个处理HTTP的库,用官方的话说:“Requests is an ISC Licensed HTTP library, written in Python, for human beings.” for human beings,好吧,废话不说,上代码:
这里我将会把所需代码都贴出来,并贴一个示例。由于Windows平台貌似没有Gevent支持,我还改写了一个多线程版本,一并放出。
首先是extract:

#coding:utf-8
import re

def extract(begin, end, html):
    """Return the stripped text found between ``begin`` and ``end``.

    The first occurrence of ``begin`` in ``html`` is used, together with the
    first ``end`` that follows it.  Passing ``end=None`` takes everything
    after ``begin``.

    Returns ``''`` when ``html`` is empty and ``None`` when a marker cannot
    be found.
    """
    if not html:
        return ''
    head = html.find(begin)
    if head < 0:
        return None
    head += len(begin)
    if end is None:
        # No closing marker requested: take the rest of the document.
        return html[head:].strip()
    tail = html.find(end, head)
    if tail < 0:
        return None
    return html[head:tail].strip()

def extract_all(begin, end, html):
    return map(str.strip, _extract_all(begin, end, html))

def _extract_all(begin, end, html):
    if not html:
        return ''
    result = []
    from_pos = 0
    while True:
        start = html.find(begin, from_pos)
        if start >= 0:
            start += len(begin)
            endpos = html.find(end, start)
            if endpos >= 0:
                result.append(html[start:endpos])
                from_pos = endpos+len(end)
                continue
        break
    return result

def line_strip(txt):
    """Strip every line of ``txt`` and drop the lines that end up empty.

    Returns ``''`` for empty or ``None`` input; otherwise the surviving
    lines joined by a newline.
    """
    if not txt:
        return ''
    # NOTE(review): as written this replaces a space with a space, which is a
    # no-op; presumably the first argument was meant to be a non-breaking or
    # full-width space -- confirm against the upstream source.
    lines = txt.replace(' ', ' ').split('\n')
    kept = []
    for line in lines:
        line = line.strip()
        if line:
            kept.append(line)
    return '\n'.join(kept)

def extract_strip(begin, end, html):
    """Extract the text between ``begin`` and ``end`` and tidy it line by line.

    The fragment found by ``extract`` is passed through ``line_strip``.
    Returns ``''`` when ``html`` is empty and ``None`` when nothing (or only
    whitespace) is found between the markers.
    """
    if not html:
        return ''
    t = extract(begin, end, html)
    if t:
        # The helper defined in this module is ``line_strip``; the previous
        # call to ``strip_line`` raised NameError.
        return line_strip(t)
    return None


def extract_map(begin, end, html, func):
    """Rewrite every ``begin`` ... ``end`` span of ``html`` through ``func``.

    Each span, markers included, is passed to ``func``.  A truthy return
    value replaces the span; a falsy one removes it.  Text outside the spans
    is copied unchanged.  Spans are processed left to right and never
    overlap; a ``begin`` without a following ``end`` ends the scan and the
    remaining text is kept verbatim.

    ``begin`` and ``end`` are expected to be non-empty strings.
    Returns ``''`` when ``html`` is empty or ``None``.
    """
    if not html:
        return ''
    pieces = []
    len_begin = len(begin)
    len_end = len(end)
    cursor = 0  # index just past the previously handled span
    while True:
        pos = html.find(begin, cursor)
        # Keep the position in its own name: rebinding ``end`` to an int made
        # the search for the second span raise TypeError.  Looking for the
        # closing marker only after the opening one keeps them from overlapping.
        stop = html.find(end, pos + len_begin) if pos >= 0 else -1
        if stop < 0:
            pieces.append(html[cursor:])
            break
        stop += len_end
        pieces.append(html[cursor:pos])
        replacement = func(html[pos:stop])
        if replacement:
            pieces.append(replacement)
        # Continue after the whole span so its content is not scanned again.
        cursor = stop

    return ''.join(pieces)


if __name__ == '__main__':

    # Placeholder: running this module directly does nothing.
    pass

然后是爬虫框架,下面是原版

#coding:utf-8
import _env
from gevent.queue import Empty, Queue
import gevent
import gevent.monkey
import requests
from urlparse import urlparse, parse_qs
import re
gevent.monkey.patch_all()


class Bot(object):
    """Crawler that fetches queued URLs with a pool of gevent greenlets.

    Every fetched response is dispatched through ``route`` (matched on the
    path of the final URL); whatever URLs the handler's ``get`` yields are
    put back on the queue.
    """
    # Value for the ``Cookie`` request header; ``run`` sets it per instance.
    cookie = None
    # Base request headers.  Treated as read-only: each request uses a copy.
    headers = {}

    def __init__(self, route):
        """Create a bot with an empty URL queue that dispatches via ``route``."""
        self.queue = Queue()
        self.route = route

    def _fetch(self):
        """Worker loop: fetch URLs from the queue until it stays empty.

        The worker returns once no URL arrives within ``timeout + 10``
        seconds.
        """
        queue = self.queue
        timeout = self.timeout
        route = self.route
        while True:
            try:
                url = queue.get(timeout=timeout + 10)
            except Empty:
                return

            # Copy so the class-level ``headers`` dict is never mutated and
            # a cookie set on one bot cannot leak into another.
            headers = dict(self.headers)
            if self.cookie:
                headers['Cookie'] = self.cookie
            req = requests.get(url, timeout=timeout, headers=headers, proxies=self.proxies)
            p = urlparse(req.url)

            cls, args = route.match(p.path)
            if not cls:
                continue
            r = cls(req).get(*args)
            if r:
                for i in r:
                    if i:
                        queue.put(i)

    def run(self, num=10, timeout=60, proxies=None, cookie=None):
        """Start ``num`` workers and block until all of them have finished.

        ``timeout`` is the per-request timeout in seconds, ``proxies`` is
        handed to ``requests.get`` (an empty dict when omitted) and
        ``cookie`` is sent as the ``Cookie`` header when truthy.
        """
        self.proxies = {} if proxies is None else proxies
        self.timeout = timeout
        self.cookie = cookie
        workers = [gevent.spawn(self._fetch) for _ in xrange(num)]
        # Wait for every worker, not just the last one spawned; this also
        # copes with ``num == 0``.
        gevent.joinall(workers)

    def put(self, url):
        """Queue ``url`` for fetching."""
        self.queue.put(url)

class Route(object):
    """Registry that maps URL path patterns to handlers."""

    def __init__(self):
        # (compiled pattern, handler) pairs in registration order.
        self.map = []

    def match(self, url):
        """Return ``(handler, groups)`` for the first pattern matching ``url``.

        Patterns are tried in registration order with ``re.match``;
        ``(None, None)`` is returned when none of them matches.
        """
        for pattern, handler in self.map:
            found = pattern.match(url)
            if found is not None:
                return handler, found.groups()
        return None, None

    def __call__(self, path):
        """Return a decorator that registers its target under ``path``.

        A trailing ``$`` is appended to ``path`` unless it already ends
        with one.  The decorated object is returned unchanged.
        """
        pattern = re.compile(path if path.endswith('$') else path + '$')

        def register(func):
            self.map.append((pattern, func))
            return func

        return register


# Module-level singletons: the routing table and the bot that dispatches
# fetched pages through it.
route = Route()
bot = Bot(route)

from extract import extract, extract_all

class Page(object):
    """Base class for route handlers; wraps one fetched response."""

    def __init__(self, req):
        """Keep the response and its body, and parse the URL's query string.

        The parsed query (blank values kept) is attached to the response
        object as ``req.arguments``.
        """
        req.arguments = parse_qs(urlparse(req.url).query, 1)
        self.req = req
        self.html = req.content

    def get_argument(self, name, default=None):
        """Return the first value of query parameter ``name``, UTF-8 encoded.

        ``default`` is returned as is when the parameter is absent.
        """
        values = self.req.arguments.get(name)
        if values is not None:
            return values[0].encode('utf-8', 'ignore')
        return default

    def extract(self, begin, end):
        """Apply the module-level ``extract`` to this page's body."""
        return extract(begin, end, self.html)

    def extract_all(self, begin, end):
        """Apply the module-level ``extract_all`` to this page's body."""
        return extract_all(begin, end, self.html)

然后是线程版的,适用于Windows平台或者未安装Gevent的情况:

#coding:utf-8
import _env

import re
import time
import requests
from Queue import Queue
from threading import Thread
from urlparse import urlparse, parse_qs

# User-Agent header value the threaded Bot sends with every request.
ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1'

class Bot(object):
    """Crawler that fetches queued URLs with a pool of worker threads.

    Every fetched response is dispatched through ``route`` (matched on the
    path of the final URL); whatever URLs the handler's ``get`` yields are
    put back on the queue.
    """
    # Value for the ``Cookie`` request header; ``run`` sets it per instance.
    cookie = None
    # Base request headers.  Treated as read-only: each request uses a copy.
    headers = {}

    def __init__(self, route):
        """Create a bot with an empty URL queue that dispatches via ``route``."""
        self.queue = Queue()
        self.route = route

    def _fetch(self):
        """Worker loop: fetch URLs from the queue until it stays empty.

        The worker returns once no URL arrives within ``timeout + 10``
        seconds.  It sleeps three seconds after every request.
        """
        # Imported locally because the module only imports ``Queue`` itself.
        from Queue import Empty

        queue = self.queue
        timeout = self.timeout
        route = self.route
        while True:
            try:
                url = queue.get(timeout=timeout + 10)
            except Empty:
                # Only an empty queue ends the worker; a bare ``except``
                # would also hide unrelated errors.
                return

            # Copy so the class-level ``headers`` dict is never mutated.
            headers = dict(self.headers)
            if self.cookie:
                headers['Cookie'] = self.cookie
            headers['User-Agent'] = ua
            req = requests.get(url, timeout=timeout, headers=headers, proxies=self.proxies)
            time.sleep(3)
            p = urlparse(req.url)

            cls, args = route.match(p.path)
            if not cls:
                continue
            r = cls(req).get(*args)
            if r:
                for i in r:
                    if i:
                        queue.put(i)

    def run(self, num=6, timeout=10, proxies=None, cookie=None):
        """Start ``num`` worker threads and return without waiting for them.

        ``timeout`` is the per-request timeout in seconds, ``proxies`` is
        handed to ``requests.get`` (an empty dict when omitted) and
        ``cookie`` is sent as the ``Cookie`` header when truthy.
        """
        self.timeout = timeout
        self.proxies = {} if proxies is None else proxies
        self.cookie = cookie
        # The workers are regular (non-daemon) threads, so the interpreter
        # stays alive until each of them has drained the queue.
        for _ in xrange(num):
            Thread(target=self._fetch).start()

    def put(self, url):
        """Queue ``url`` for fetching."""
        self.queue.put(url)

class Route(object):
    """Registry that maps URL path patterns to handlers."""

    def __init__(self):
        # (compiled pattern, handler) pairs in registration order.
        self.map = []

    def match(self, url):
        """Return ``(handler, groups)`` for the first pattern matching ``url``.

        Patterns are tried in registration order with ``re.match``;
        ``(None, None)`` is returned when none of them matches.
        """
        for pattern, handler in self.map:
            found = pattern.match(url)
            if found is not None:
                return handler, found.groups()
        return None, None

    def __call__(self, path):
        """Return a decorator that registers its target under ``path``.

        A trailing ``$`` is appended to ``path`` unless it already ends
        with one.  The decorated object is returned unchanged.
        """
        pattern = re.compile(path if path.endswith('$') else path + '$')

        def register(func):
            self.map.append((pattern, func))
            return func

        return register


# Module-level singletons: the routing table and the bot that dispatches
# fetched pages through it.
route = Route()
bot = Bot(route)

from extract import extract, extract_all

class Page(object):
    """Base class for route handlers; wraps one fetched response."""

    def __init__(self, req):
        """Keep the response and its body, and parse the URL's query string.

        The parsed query (blank values kept) is attached to the response
        object as ``req.arguments``.
        """
        req.arguments = parse_qs(urlparse(req.url).query, 1)
        self.req = req
        self.html = req.content

    def get_argument(self, name, default=None):
        """Return the first value of query parameter ``name``, UTF-8 encoded.

        ``default`` is returned as is when the parameter is absent.
        """
        values = self.req.arguments.get(name)
        if values is not None:
            return values[0].encode('utf-8', 'ignore')
        return default

    def extract(self, begin, end):
        """Apply the module-level ``extract`` to this page's body."""
        return extract(begin, end, self.html)

    def extract_all(self, begin, end):
        """Apply the module-level ``extract_all`` to this page's body."""
        return extract_all(begin, end, self.html)

线程版和Gevent用法完全一样,下面是一个抓取豆瓣相册的例子:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import _env
import _bot
import os
import urllib
from os.path import join, exists
from _bot import bot, route, Page
from extract import extract, extract_all

# URL template of a raw image; filled with the photo id.
url_base = 'http://img1.douban.com/view/photo/raw/public/p%s.jpg'
# URL template of one photo-listing page; filled with the ``start`` offset.
url_tmp = 'http://movie.douban.com/subject/3395373/photos?type=S&start=%s&sortby=vote&size=a&subtype=a'
# Directory the downloaded images are written to.
f_base = '/home/lerry/movie'
# Alternative target directory on Windows (disabled):
#f_base = u'E:\Download\photo\蝙蝠侠'
# Path template of a saved image; used to skip photos that already exist.
f_tmp = join(f_base, 'p%s.jpg')


@route('/subject/3395373/photos')
class url_list(Page):
    """Handler for a photo-listing page of the movie."""

    def get(self):
        """Yield the URLs to crawl next.

        For the first listing page (``start`` == 0) the other listing pages
        are yielded first.  Then, for every photo linked from this page that
        has not been saved yet, the URL of its raw image is yielded.
        """
        offset = int(self.get_argument('start'))
        if offset == 0:
            # Fan out to the remaining listing pages: start=40, 80, ..., 1600.
            for n in xrange(1, 41):
                yield url_tmp % (n * 40)

        block = self.extract('class="poster-col4 clearfix"', '</ul>')
        for item in extract_all('<li>', '</li>', block):
            link = extract('a href="', '/">', item)
            if not link:
                continue
            # Everything after the first 37 characters of the link is used
            # as the photo id.
            photo_id = link[37:]
            if exists(f_tmp % photo_id):
                continue
            yield url_base % photo_id

@route('/view/photo/raw/public/(.+)')
class single(Page):
    """Handler for a raw image URL: stores the response body on disk."""

    def get(self, arg):
        """Save the fetched content under the name captured from the URL path."""
        save_pic(content=self.html, fname=arg)

def save_pic(content, fname):
    """Write ``content`` to the file ``fname`` inside ``f_base``.

    The file is opened in binary mode and overwritten if it already exists.
    A "<fname> saved" line is printed once the write has succeeded.
    """
    fpath = join(f_base, fname)
    # ``with`` closes the file even when the write raises.
    with open(fpath, 'wb') as f:
        f.write(content)
    # Single-argument form prints the same text under Python 2 and 3.
    print('%s saved' % fname)

if __name__ == '__main__':
    # Seed the crawl with the first listing page (start=0), then start the
    # workers with their default settings.
    bot.put('http://movie.douban.com/subject/3395373/photos?type=S&start=0&sortby=vote&size=a&subtype=a')
    bot.run()
2012-09-15 20:49 | 3287 | python, requests, gevent