Batch-downloading 51voa MP3s and transcripts with Python

I wrote this little script a while back when a classmate asked me to help her download VOA MP3s. It downloads in batches, and re-running it won't re-download files it already has — so when the site updates, just run it again and it only fetches the new items. It mainly uses BeautifulSoup to parse the pages. I saw someone asking about this on v2ex today, so here it is.

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
For:
by Lerry  http://lerry.org
Start from 2012-01-16 20:09
Last edit at 2012-01-16 20:09
'''
import os
import urllib
from BeautifulSoup import BeautifulSoup

base_dir = os.path.join(os.getcwd(),'voa')
if not os.path.exists(base_dir):
    os.mkdir(base_dir)

def get_url_list(url):
    '''Collect article URLs from the listing page.'''
    content = urllib.urlopen(url).read()
    temp = BeautifulSoup(content.decode('gbk', 'ignore'))
    url_list = []
    for a in temp.findAll('a'):
        # Some anchors have no href; .get avoids a KeyError on those.
        href = a.get('href', '')
        if href.startswith('/VOA_Standard_English/') and href.endswith('html'):
            url_list.append('http://www.51voa.com' + href)
    return list(set(url_list))

def parse_page(url):
    '''Parse an article page, save the transcript and fetch the MP3.'''
    content = urllib.urlopen(url).read().decode('gbk', 'ignore')
    temp = BeautifulSoup(content)
    # Title and date are sliced out of the raw tag strings, so this is
    # tightly coupled to the page markup of the time.
    title = '_'.join(str(temp.html.title).split()[4:-1])
    temp_date = str(temp.find("span", {"class": "datetime"}))[23:-7]
    pub_date = '_'.join(temp_date.replace(',', '').split())
    mp3_url = ''
    for a in temp.findAll('a'):
        href = a.get('href', '')
        # The MP3 link goes through a /path.asp?url= redirect.
        if href.startswith('/path.asp?url='):
            mp3_url = 'http://www.51voa.com' + href
    if not mp3_url:
        print url, 'no mp3 link found'
        return
    text = temp.find("div", "articleBody").findAll('p')
    mp3_dir = os.path.join(base_dir, title)
    filename = os.path.join(mp3_dir, mp3_url.split('/')[-1][:-4])
    if not os.path.exists(mp3_dir):
        os.mkdir(mp3_dir)
    if not os.path.exists(filename + '.txt'):
        f = open(filename + '.txt', 'w')
        for p in text:
            f.write(str(p) + '\n')
        f.close()
        print mp3_url, 'Text saved'
    if not os.path.exists(filename + '.mp3'):
        down_mp3(mp3_url, filename + '.mp3')

def down_mp3(mp3_url, filename):
    '''Fetch the MP3 and write it to disk.'''
    try:
        data = urllib.urlopen(mp3_url).read()
    except IOError:
        # Don't leave an empty file behind on a failed download.
        print mp3_url, 'not downloaded'
        return
    f = open(filename, 'wb')
    f.write(data)
    f.close()
    print 'Downloaded'

def main():
    root = 'http://www.51voa.com/VOA_Standard_English/'
    url_list = get_url_list(root)
    #print len(url_list)
    for i in url_list:
        parse_page(i)
        #break

if __name__ == '__main__':
    main()
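A quick usage note: the script targets Python 2 with BeautifulSoup 3 (on PyPI under the name BeautifulSoup). Save it under any name, say voa_down.py (a name I'm making up here), and run python voa_down.py. It creates a voa/ folder in the current directory with one subfolder per article holding the transcript (.txt) and the audio (.mp3); thanks to the os.path.exists checks, re-running only fetches new items.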

The earlier version stopped working and readers kept emailing me about it, so I updated it on 2013-06-22:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import _env
import os
from os.path import exists, join, dirname
import urllib
from lerrylib.Bot import bot, route, Page
from lerrylib.extract import extract, extract_all

mp3_dir = join(os.getcwd(), 'mp3')


url = 'http://www.51voa.com/VOA_Standard_English/'
url_base = 'http://www.51voa.com'

proxies = {
  "http": "113.11.199.27:8123",
  }
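
One thing to note: the proxies dict above is defined but never passed to anything below. If the proxy is actually needed, Python 2's urllib.urlopen accepts a proxies mapping as an optional argument, so down_mp3 could be wired up roughly like this (a sketch, not what the original does):

# Sketch: route the download through the `proxies` dict defined above.
data = urllib.urlopen(mp3_url, None, proxies).read()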


def down_mp3(mp3_url, filename):
    '''Fetch the MP3 and write it to disk.'''
    try:
        data = urllib.urlopen(mp3_url).read()
    except IOError:
        # Don't leave an empty file behind on a failed download.
        print mp3_url, 'not downloaded'
        return
    f = open(filename, 'wb')
    f.write(data)
    f.close()
    print mp3_url, 'Downloaded'

@route('/VOA_Standard_English/')
class url_list(Page):
    def get(self):
        # Pull the article list block, then queue every article link in it.
        li = self.extract('<div id="list">', '</div>')
        for i in extract_all('<a href="', '" target', li):
            url = (url_base + i).strip()
            print url
            yield url

@route('/VOA_Standard_English/(.+)')
class url_single(Page):
    def get(self, url):
        # The title comes from the meta description; the MP3 URL is
        # embedded in the page's Player("...") javascript call.
        title = self.extract('"description" content="VOA Standard English, ', '" />')
        the_root = join(mp3_dir, title)
        if not exists(the_root):
            os.mkdir(the_root)
        mp3 = 'http://down.51voa.com' + self.extract('Player("', '");')
        txt = self.extract('><BR><BR>', '</div>')
        if txt:
            f = open(join(the_root, title + '.txt'), 'w')
            f.write(txt)
            f.close()
        down_mp3(mp3, join(the_root, title + '.mp3'))


if __name__ == '__main__':
    # Create the output directory before the bot starts fetching;
    # otherwise os.mkdir(the_root) above fails on the first page.
    if not exists(mp3_dir):
        os.mkdir(mp3_dir)
    bot.put(url)
    bot.run(num=6, timeout=15)
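
If you want to try this today: both scripts are Python 2, the second depends on my private lerrylib helpers, and the site markup has probably changed again. As a rough starting point, here is a minimal Python 3 sketch of the same idea using requests and BeautifulSoup 4, downloading only the MP3s for brevity. The listing URL and the href patterns are carried over from the 2012 script and are assumptions about the current site, so expect to adjust them.

#!/usr/bin/env python3
# Python 3 sketch of the same idea with requests + BeautifulSoup 4.
# The listing URL and href patterns come from the 2012 script above
# and may no longer match the live site.
import os

import requests
from bs4 import BeautifulSoup

BASE = 'http://www.51voa.com'
LISTING = BASE + '/VOA_Standard_English/'
OUT_DIR = os.path.join(os.getcwd(), 'voa')


def article_urls():
    '''Yield article page URLs found on the listing page.'''
    html = requests.get(LISTING, timeout=15).text
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('/VOA_Standard_English/') and href.endswith('html'):
            yield BASE + href


def download(url, path):
    '''Fetch url into path, skipping files that already exist.'''
    if os.path.exists(path):
        return
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    with open(path, 'wb') as f:
        f.write(r.content)


def main():
    os.makedirs(OUT_DIR, exist_ok=True)
    for page_url in article_urls():
        soup = BeautifulSoup(requests.get(page_url, timeout=15).text,
                             'html.parser')
        # In 2012 the MP3 link went through a /path.asp?url= redirect.
        a = soup.find('a', href=lambda h: h and h.startswith('/path.asp?url='))
        if a is None:
            continue
        name = os.path.basename(page_url).rsplit('.', 1)[0]
        download(BASE + a['href'], os.path.join(OUT_DIR, name + '.mp3'))
        print(page_url, 'done')


if __name__ == '__main__':
    main()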