Batch-downloading 51VOA MP3s and transcripts with Python
I wrote this little script a while back when a classmate asked me to help her download VOA MP3 files. It downloads everything in one batch, and rerunning it never fetches the same file twice: when the site publishes new material, just run it again and only the new files come down. The page parsing is done with BeautifulSoup. Someone asked about this on V2EX today, so I'm posting it here.
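The "no repeat downloads" part is nothing clever; stripped to its core, it is just a filesystem check before each write. A minimal sketch of the idea (`fetch_and_save` is an illustrative name, not a function in the script below):

```python
import os
import urllib

def fetch_and_save(url, path):
    # a file already on disk is treated as done and skipped,
    # so a rerun only fetches what the site added since last time
    if os.path.exists(path):
        return
    data = urllib.urlopen(url).read()
    open(path, 'wb').write(data)
```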
```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
For: by Lerry  https://lerry.me
Start from 2012-01-16 20:09
Last edit at 2012-01-16 20:09
'''
import os
import urllib

from BeautifulSoup import BeautifulSoup

base_dir = os.path.join(os.getcwd(), 'voa')
if not os.path.exists(base_dir):
    os.mkdir(base_dir)


def get_url_list(url):
    '''collect the article page URLs from the list page'''
    content = urllib.urlopen(url).read()
    temp = BeautifulSoup(content.decode('gbk', 'ignore'))
    urls = temp.findAll('a')
    url_list = []
    for i in urls:
        i = i['href']
        if i.startswith('/VOA_Standard_English/') and i.endswith('html'):
            url_list.append('http://www.51voa.com' + i)
    return list(set(url_list))  # deduplicate


def parse_page(url):
    content = urllib.urlopen(url).read().decode('gbk', 'ignore')
    temp = BeautifulSoup(content)
    title = '_'.join(str(temp.html.title).split()[4:-1])
    temp_date = str(temp.find("span", {"class": "datetime"}))[23:-7]
    pub_date = '_'.join(temp_date.replace(',', '').split())
    # the mp3 link is the one pointing at /path.asp?url=...
    urls = temp.findAll('a')
    mp3_url = ''
    for i in urls:
        i = i['href']
        if i.startswith('/path.asp?url='):
            mp3_url = 'http://www.51voa.com' + i
    text = temp.find("div", "articleBody").findAll('p')
    mp3_dir = os.path.join(base_dir, title)
    filename = os.path.join(mp3_dir, mp3_url.split('/')[-1][:-4])
    if not os.path.exists(mp3_dir):
        os.mkdir(mp3_dir)
    # skip anything already on disk, so reruns only fetch new posts
    if not os.path.exists(filename + '.txt'):
        f = open(filename + '.txt', 'w')
        for i in text:
            f.write(str(i) + '\n')
        f.close()
        print mp3_url, 'Text saved'
    if not os.path.exists(filename + '.mp3'):
        down_mp3(mp3_url, filename + '.mp3')


def down_mp3(mp3_url, filename):
    try:
        open(filename, 'wb').write(urllib.urlopen(mp3_url).read())
        print 'Downloaded'
    except:
        print mp3_url, 'not downloaded'


def main():
    root = 'http://www.51voa.com/VOA_Standard_English/'
    url_list = get_url_list(root)
    for i in url_list:
        parse_page(i)


main()
```
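One caveat with the skip-if-exists check: an interrupted run can leave a truncated `.mp3` on disk, which the next run will treat as complete. A slightly safer variant of `down_mp3` (a sketch; `down_mp3_safe` is not in the script above) writes to a temporary name and only renames on success:

```python
import os
import urllib

def down_mp3_safe(mp3_url, filename):
    tmp = filename + '.part'
    try:
        data = urllib.urlopen(mp3_url).read()
        open(tmp, 'wb').write(data)
        # only fully written files ever get the final name,
        # so the existence check stays trustworthy across reruns
        os.rename(tmp, filename)
        print mp3_url, 'Downloaded'
    except IOError:
        if os.path.exists(tmp):
            os.remove(tmp)
        print mp3_url, 'not downloaded'
```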
Updated 2013-06-22: the version above stopped working, and readers kept emailing to ask about it, so here is a rewrite.
```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
import _env
import os
from os.path import exists, join, dirname
import urllib

from lerrylib.Bot import bot, route, Page
from lerrylib.extract import extract, extract_all

mp3_dir = join(os.getcwd(), 'mp3')
url = 'http://www.51voa.com/VOA_Standard_English/'
url_base = 'http://www.51voa.com'
proxies = {
    "http": "113.11.199.27:8123",
}


def down_mp3(mp3_url, filename):
    try:
        open(filename, 'wb').write(urllib.urlopen(mp3_url).read())
        print mp3_url, 'Downloaded'
    except:
        print mp3_url, 'not downloaded'


@route('/VOA_Standard_English/')
class url_list(Page):
    def get(self):
        # pull the link list out of the index page and queue every article;
        # the HTML tag markers were swallowed when this post was first
        # rendered, so the marker strings below are reconstructed guesses
        li = self.extract('<ul>', '</ul>')
        for i in extract_all('<a href="', '"', li):
            bot.put(url_base + i)


@route('/VOA_Standard_English/(.+).html')  # route pattern reconstructed
class page(Page):
    def get(self):
        title = self.extract('"description" content="', '"')
        the_root = join(mp3_dir, title)
        if not exists(the_root):
            os.mkdir(the_root)
        # the mp3 path is the argument of the on-page Player() call
        mp3 = 'http://down.51voa.com' + self.extract('Player("', '");')
        txt = self.extract('<div class="articleBody">', '</div>')
        if txt:
            f = open(join(the_root, title + '.txt'), 'w')
            f.write(txt)
            f.close()
        down_mp3(mp3, join(the_root, title + '.mp3'))


if __name__ == '__main__':
    if not exists(mp3_dir):
        os.mkdir(mp3_dir)
    bot.put(url)
    bot.run(num=6, timeout=15)
```
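`lerrylib` here is a personal helper library, so the script above won't run as-is for most readers. For reference, the same flow can be sketched in Python 3 with the widely available `requests` and BeautifulSoup 4 packages; the markup markers mirror the ones above and are assumptions that may need adjusting if 51voa changes its pages:

```python
import os
import re
import requests
from bs4 import BeautifulSoup

BASE = 'http://www.51voa.com'
LIST_URL = BASE + '/VOA_Standard_English/'
OUT = os.path.join(os.getcwd(), 'mp3')
os.makedirs(OUT, exist_ok=True)

def article_links():
    # same idea as the url_list page handler above
    soup = BeautifulSoup(requests.get(LIST_URL, timeout=15).content, 'html.parser')
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('/VOA_Standard_English/') and href.endswith('.html'):
            yield BASE + href

def grab(url):
    html = requests.get(url, timeout=15).text
    soup = BeautifulSoup(html, 'html.parser')
    title = '_'.join(soup.title.get_text().split()) if soup.title else url.rsplit('/', 1)[-1]
    title = re.sub(r'[\\/:*?"<>|]', '_', title)  # keep the title usable as a folder name
    body = soup.find('div', class_='articleBody')
    m = re.search(r'Player\("([^"]+)"', html)    # same Player() marker as above
    if not (body and m):
        return
    folder = os.path.join(OUT, title)
    os.makedirs(folder, exist_ok=True)
    txt_path = os.path.join(folder, title + '.txt')
    mp3_path = os.path.join(folder, title + '.mp3')
    if not os.path.exists(txt_path):             # skip what a previous run saved
        open(txt_path, 'w', encoding='utf-8').write(body.get_text('\n'))
    if not os.path.exists(mp3_path):
        mp3_url = 'http://down.51voa.com' + m.group(1)
        open(mp3_path, 'wb').write(requests.get(mp3_url, timeout=60).content)

for link in article_links():
    grab(link)
```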