这个小脚本是前一段时间写的,当时有同学让我帮她下载 VOA 的 mp3,我就写了个脚本,可以批量下载;脚本会跳过已经下载过的文件,所以网站更新后再次运行即可,不会重复下载。主要用到了 BeautifulSoup 解析网页。今天在 v2ex 上看到有人问,就发出来吧。
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
For:
by Lerry http://lerry.org
Start from 2012-01-16 20:09
Last edit at 2012-01-16 20:09
'''
import os
import urllib
from BeautifulSoup import BeautifulSoup
# All downloads are stored under ./voa relative to the current working
# directory; the directory is created once at import time.
base_dir = os.path.join(os.getcwd(),'voa')
if not os.path.exists(base_dir):
    os.mkdir(base_dir)
def get_url_list(url):
    '''Fetch the listing page and return unique absolute article URLs.

    Only anchors whose href starts with /VOA_Standard_English/ and ends
    with "html" are kept. Duplicates are removed via a set, so the
    returned list has no particular order (same as the original).
    '''
    # Listing pages are served as GBK; undecodable bytes are dropped.
    page = urllib.urlopen(url).read()
    soup = BeautifulSoup(page.decode('gbk','ignore'))
    found = set()
    for anchor in soup.findAll('a'):
        href = anchor['href']
        if href.startswith('/VOA_Standard_English/') and href.endswith('html'):
            found.add('http://www.51voa.com'+href)
    return list(found)
def parse_page(url):
    '''Download one article page, saving its transcript (.txt) and MP3.

    A per-article directory named after the page title is created under
    base_dir; files that already exist are skipped, so reruns never
    re-download anything.
    '''
    # Article pages are GBK-encoded; ignore undecodable bytes.
    content = urllib.urlopen(url).read().decode('gbk','ignore')
    temp = BeautifulSoup(content)
    # Slice [4:-1] drops fixed leading/trailing words from the <title>
    # text and joins the rest with underscores — assumes the site's exact
    # title format; TODO confirm against live markup.
    title = '_'.join(str(temp.html.title).split()[4:-1])
    # Strip the surrounding <span class="datetime">...</span> tags by fixed
    # character offsets (23 leading, 7 trailing) — brittle; depends on the
    # exact serialized markup.
    temp_date = str(temp.find("span", { "class" : "datetime" }))[23:-7]
    # NOTE(review): pub_date is computed but never used below.
    pub_date = '_'.join(temp_date.replace(',','').split())
    urls = temp.findAll('a')
    mp3_url = ''
    # The MP3 link is the anchor routed through /path.asp?url=...; if
    # several match, the last one wins.
    for i in urls:
        i = i['href']
        if i.startswith('/path.asp?url='):
            mp3_url = 'http://www.51voa.com'+i
    # Transcript paragraphs live inside <div class="articleBody">.
    text = temp.find("div", "articleBody" ).findAll('p')
    mp3_dir = os.path.join(base_dir, title)
    # Base filename = last URL component minus its 4-char ".mp3" extension.
    filename = os.path.join(mp3_dir, mp3_url.split('/')[-1][:-4])
    if not os.path.exists(mp3_dir):
        os.mkdir(mp3_dir)
    if not os.path.exists(filename+'.txt'):
        f = open(filename+'.txt', 'w')
        for i in text:
            # Paragraphs are written as raw HTML (str of the <p> tag).
            f.write(str(i)+'\n')
        f.close()
        print mp3_url, 'Text saved'
    if not os.path.exists(filename+'.mp3'):
        down_mp3(mp3_url, filename+'.mp3')
def down_mp3(mp3_url, filename):
try:
open(filename, 'wb').write(urllib.urlopen(mp3_url).read())
print 'Downloaded'
except:
print mp3_url,' not downloaded'
def main():
    '''Crawl the VOA Standard English listing and process every article.'''
    listing_url = 'http://www.51voa.com/VOA_Standard_English/'
    for page_url in get_url_list(listing_url):
        parse_page(page_url)
main()
之前的版本已经不能用了,经常有网友发邮件询问,就更新了一下。2013年6月22日
#!/usr/bin/python
# -*- coding: utf-8 -*-
import _env
import os
from os.path import exists, join, dirname
import urllib
from lerrylib.Bot import bot, route, Page
from lerrylib.extract import extract, extract_all
# Destination directory for all downloads; created in the __main__ block.
mp3_dir = join(os.getcwd(), 'mp3')
# Listing page that seeds the crawler.
url = 'http://www.51voa.com/VOA_Standard_English/'
url_base = 'http://www.51voa.com'
# NOTE(review): defined but never referenced anywhere in this file —
# presumably intended for urllib/bot requests; confirm before relying on it.
proxies = {
    "http": "113.11.199.27:8123",
}
def down_mp3(mp3_url, filename):
try:
open(filename, 'wb').write(urllib.urlopen(mp3_url).read())
print mp3_url, 'Downloaded'
except:
print mp3_url,' not downloaded'
@route('/VOA_Standard_English/')
class url_list(Page):
def get(self):
#print self.html
li = self.extract('<div id="list">', '</div>')
for i in extract_all('<a href="', '" target', li):
url = (url_base+i).strip()
print url
yield url
@route('/VOA_Standard_English/(.+)')
class url_single(Page):
    '''Handler for one article page: saves transcript and MP3 under
    mp3_dir/<title>/.

    Files that already exist are skipped, restoring the "rerun never
    re-downloads" behaviour the first version of this script had.
    '''
    def get(self, url):
        # Title comes from the meta description tag.
        title = self.extract('"description" content="VOA Standard English, ', '" />')
        the_root = join(mp3_dir, title)
        if not exists(the_root):
            # makedirs also creates the mp3_dir parent when missing;
            # os.mkdir raised OSError if mp3_dir did not exist yet.
            os.makedirs(the_root)
        mp3 = 'http://down.51voa.com' + self.extract('Player("', '");')
        txt = self.extract('><BR><BR>', '</div>')
        if txt:
            txt_path = join(the_root, title+'.txt')
            if not exists(txt_path):
                f = open(txt_path, 'w')
                f.write(txt)
                f.close()
        mp3_path = join(the_root, title+'.mp3')
        # Skip MP3s we already have instead of re-downloading every run.
        if not exists(mp3_path):
            down_mp3(mp3, mp3_path)
if __name__ == '__main__':
    # Create the download root BEFORE starting the bot: the original
    # only created mp3_dir after bot.run() had already finished, so on a
    # fresh checkout every handler tried to write into a missing directory.
    if not exists(mp3_dir):
        os.mkdir(mp3_dir)
    bot.put(url)
    bot.run(num=6, timeout=15)