A Python Crawler Framework
Anyone who has learned Python has probably written a crawler at some point. If you want a crawler to run faster you need concurrency, and the options are threads, processes, or the recently popular Gevent, which is said to be built on a newer concept, coroutines. Whatever kind of "routine" it is, what matters is that it works well. Writing a crawler involves a whole series of chores, and a good framework makes the job much easier.
The 42qu.com source code includes a crawler framework written by the site's author (known by the nickname 教主). It works very well: it is built on Gevent and dispatches URLs with a routing scheme much like the one used in web development. He also wrote a small HTML-processing library, extract, which is nowhere near as powerful as BeautifulSoup but is simple, pleasant to use, and covers most everyday needs. HTTP requests go through requests, an HTTP library that describes itself as "an ISC Licensed HTTP library, written in Python, for human beings." For human beings indeed. Enough talk; on to the code.
I will paste all of the required code here, together with a working example. Since Gevent does not seem to be available on Windows, I also rewrote a multithreaded version, which is included as well.
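Before getting to the framework itself, here is what a bare requests call looks like, just to show the "for human beings" flavour (a throwaway snippet, not part of the framework; the URL and User-Agent value are placeholders):

#coding:utf-8
import requests

# fetch one page with a custom header and a timeout
r = requests.get('http://movie.douban.com/',
                 headers={'User-Agent': 'my-crawler'}, timeout=10)
print r.status_code     # e.g. 200
print len(r.content)    # size of the raw response body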
First, extract:
#coding:utf-8
import re


def extract(begin, end, html):
    if not html:
        return ''
    start = html.find(begin)
    if start >= 0:
        start += len(begin)
        if end is not None:
            end = html.find(end, start)
        if end is None or end >= 0:
            return html[start:end].strip()
def extract_all(begin, end, html):
    return map(str.strip, _extract_all(begin, end, html))


def _extract_all(begin, end, html):
    if not html:
        return ''
    result = []
    from_pos = 0
    while True:
        start = html.find(begin, from_pos)
        if start >= 0:
            start += len(begin)
            endpos = html.find(end, start)
            if endpos >= 0:
                result.append(html[start:endpos])
                from_pos = endpos + len(end)
                continue
        break
    return result
def line_strip(txt):
    if not txt:
        return ''
    # collapse &nbsp; entities into plain spaces (assumed: the exact literal
    # did not survive copy/paste)
    txt = txt.replace('&nbsp;', ' ').split('\n')
    return '\n'.join(i for i in [i.strip() for i in txt] if i)


def extract_strip(begin, end, html):
    if not html:
        return ''
    t = extract(begin, end, html)
    if t:
        return line_strip(t)
def extract_map(begin, end, html, func):
    # for every begin...end fragment (delimiters included), substitute
    # func(fragment); if func returns something falsy the fragment is dropped
    result = []
    prepos = None
    preend = 0
    len_end = len(end)
    len_begin = len(begin)
    while True:
        if prepos is None:
            pos = html.find(begin)
        else:
            pos = html.find(begin, prepos)
        if pos >= 0:
            endpos = html.find(end, pos)
        if pos < 0 or endpos < 0:
            result.append(html[preend:])
            break
        endpos += len_end
        result.append(html[preend:pos])
        tmp = func(html[pos:endpos])
        if tmp:
            result.append(tmp)
        prepos = pos + len_begin
        preend = endpos
    return ''.join(result)


if __name__ == '__main__':
    pass
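As a quick illustration of what these helpers return, a few throwaway lines like the following could be dropped under the __main__ guard above (the sample html string is made up):

html = '<ul class="list"><li><a href="/a">A</a></li><li><a href="/b">B</a></li></ul>'

print extract('<ul class="list">', '</ul>', html)
# -> '<li><a href="/a">A</a></li><li><a href="/b">B</a></li>'
print extract_all('href="', '"', html)
# -> ['/a', '/b']
print line_strip('  first line \n\n   second line ')
# -> 'first line\nsecond line'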
Next comes the crawler framework itself. This is the original Gevent version:
#coding:utf-8
import _env
from gevent.queue import Empty, Queue
import gevent
import gevent.monkey
import requests
from urlparse import urlparse, parse_qs
import re

gevent.monkey.patch_all()


class Bot(object):
    cookie = None
    headers = {}

    def __init__(self, route):
        self.queue = Queue()
        self.route = route

    def _fetch(self):
        queue = self.queue
        timeout = self.timeout
        route = self.route
        while True:
            try:
                # wait a bit longer than the request timeout; once the queue
                # stays empty that long, this worker exits
                url = queue.get(timeout=timeout+10)
            except Empty:
                return
            headers = self.headers
            if self.cookie:
                headers['Cookie'] = self.cookie
            req = requests.get(url, timeout=timeout, headers=headers,
                               proxies=self.proxies)
            p = urlparse(req.url)
            cls, args = route.match(p.path)
            if cls:
                o = cls(req)
                r = o.get(*args)
                if r:
                    # anything the handler yields is a new URL to crawl
                    for i in r:
                        if i:
                            queue.put(i)

    def run(self, num=10, timeout=60, proxies={}, cookie=None):
        self.proxies = proxies
        self.timeout = timeout
        self.cookie = cookie
        # spawn all the workers first, then wait for every one of them
        jobs = [gevent.spawn(self._fetch) for i in xrange(num)]
        gevent.joinall(jobs)

    def put(self, url):
        self.queue.put(url)

class Route(object):
    def __init__(self):
        self.map = []

    def match(self, url):
        for r, f in self.map:
            m = r.match(url)
            if m:
                return f, m.groups()
        return None, None

    def __call__(self, path):
        if not path.endswith('$'):
            path += '$'
        re_path = re.compile(path)
        def _(func):
            self.map.append((re_path, func))
            return func
        return _


route = Route()
bot = Bot(route)
from extract import extract, extract_all


class Page(object):
    def __init__(self, req):
        p = urlparse(req.url)
        req.arguments = parse_qs(p.query, 1)
        self.req = req
        self.html = req.content

    def get_argument(self, name, default=None):
        result = self.req.arguments.get(name, None)
        if result is None:
            return default
        return result[0].encode('utf-8', 'ignore')

    def extract(self, begin, end):
        return extract(begin, end, self.html)

    def extract_all(self, begin, end):
        return extract_all(begin, end, self.html)
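The moving parts fit together like this: route maps a path regex to a Page subclass, the bot pulls URLs off the queue, fetches each one with requests, dispatches the response to the matching handler's get(), and every URL the handler yields is pushed back onto the queue. A minimal made-up handler would look like the sketch below (example.com, the /list/ path, and ListPage are placeholders; the real Douban example comes later in this post):

# a minimal made-up handler; the domain, path and class name are placeholders
@route(r'/list/(\d+)')
class ListPage(Page):
    def get(self, page_no):
        # page_no is the regex group captured from the path;
        # every absolute link found on the page goes back into the crawl queue
        for link in self.extract_all('<a href="', '"'):
            if link.startswith('http://'):
                yield link

bot.put('http://example.com/list/1')   # seed URL
bot.run(num=10, timeout=60)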
And here is the threaded version, for Windows or for machines without Gevent installed:
#coding:utf-8
import _env
import re
import time
import requests
from Queue import Empty, Queue
from threading import Thread
from urlparse import urlparse, parse_qs

ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1'


class Bot(object):
    cookie = None
    headers = {}

    def __init__(self, route):
        self.queue = Queue()
        self.route = route

    def _fetch(self):
        queue = self.queue
        timeout = self.timeout
        route = self.route
        while True:
            try:
                url = queue.get(timeout=timeout+10)
            except Empty:
                return
            headers = self.headers
            if self.cookie:
                headers['Cookie'] = self.cookie
            headers['User-Agent'] = ua
            req = requests.get(url, timeout=timeout, headers=headers,
                               proxies=self.proxies)
            time.sleep(3)   # crude throttle between requests
            p = urlparse(req.url)
            cls, args = route.match(p.path)
            if cls:
                o = cls(req)
                r = o.get(*args)
                if r:
                    for i in r:
                        if i:
                            queue.put(i)

    def run(self, num=6, timeout=10, proxies={}, cookie=None):
        self.timeout = timeout
        self.proxies = proxies
        self.cookie = cookie
        for i in xrange(num):
            g = Thread(target=self._fetch)
            g.start()
        # the workers are ordinary (non-daemon) threads, so the script keeps
        # running until every one of them has timed out on an empty queue

    def put(self, url):
        self.queue.put(url)

class Route(object):
    def __init__(self):
        self.map = []

    def match(self, url):
        for r, f in self.map:
            m = r.match(url)
            if m:
                return f, m.groups()
        return None, None

    def __call__(self, path):
        if not path.endswith('$'):
            path += '$'
        re_path = re.compile(path)
        def _(func):
            self.map.append((re_path, func))
            return func
        return _


route = Route()
bot = Bot(route)
from extract import extract, extract_all


class Page(object):
    def __init__(self, req):
        p = urlparse(req.url)
        req.arguments = parse_qs(p.query, 1)
        self.req = req
        self.html = req.content

    def get_argument(self, name, default=None):
        result = self.req.arguments.get(name, None)
        if result is None:
            return default
        return result[0].encode('utf-8', 'ignore')

    def extract(self, begin, end):
        return extract(begin, end, self.html)

    def extract_all(self, begin, end):
        return extract_all(begin, end, self.html)
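Apart from swapping greenlets for threads, this version sets a browser User-Agent, sleeps three seconds after every request as a crude throttle, and again relies on the queue.get timeout so that workers exit once no new URL has shown up for a while; since the worker threads are not daemons, the script keeps running until the last of them finishes.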
The threaded version is used exactly like the Gevent one. Below is an example that downloads a Douban photo album:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import _env
import _bot
import os
import urllib
from os.path import join, exists
from _bot import bot, route, Page
from extract import extract, extract_all
url_base = 'http://img1.douban.com/view/photo/raw/public/p%s.jpg'
url_tmp = 'http://movie.douban.com/subject/3395373/photos?type=S&start=%s&sortby=vote&size=a&subtype=a'
f_base = '/home/lerry/movie'
#f_base = u'E:\Download\photo\蝙蝠侠'
f_tmp = join(f_base, 'p%s.jpg')

@route('/subject/3395373/photos')
class url_list(Page):
    def get(self):
        page = self.get_argument('start')
        page = int(page)
        if page == 0:
            # from the first listing page, enqueue the other 40 listing pages
            for i in xrange(1, 41):
                start = i*40
                url = url_tmp % start
                yield url
        li = self.extract('class="poster-col4 clearfix"', '</ul>')
        for i in extract_all('<li>', '</li>', li):
            path = extract('a href="', '/">', i)
            if not path:
                continue
            # the photo id follows 'http://movie.douban.com/photos/photo/' (37 chars)
            id = path[37:]
            fpath = f_tmp % id
            if exists(fpath):
                continue
            url = url_base % id
            yield url

@route('/view/photo/raw/public/(.+)')
class single(Page):
    def get(self, arg):
        save_pic(self.html, arg)


def save_pic(content, fname):
    fpath = join(f_base, fname)
    f = open(fpath, 'wb')
    f.write(content)
    f.close()
    print fname, 'saved'


if __name__ == '__main__':
    bot.put('http://movie.douban.com/subject/3395373/photos?type=S&start=0&sortby=vote&size=a&subtype=a')
    bot.run()
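The flow is straightforward: the seed URL matches the first route, and from the first listing page the handler yields the other forty listing pages plus the direct .jpg URL of every photo not already on disk; those image URLs match the second route, whose handler simply hands the response body to save_pic.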
Author: Lerry
Title: A Python Crawler Framework
Published: 2012-09-15
License: CC BY-NC-ND 4.0 DEED