Article · 31 Mar 2012

用python批量下载51voa的mp3和文本

#Python #爬虫 · 2,410 reads

这个小脚本是前一段写的,当时有同学让我帮她下载voa的mp3,我就弄了个脚本,能批量下载,再次运行不会重复下载,就是说,如果网站有更新了,再次运行即可,不会重复下载。主要用到了BeautifulSoup解析网页。

Codepython
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
For: by Lerry https://lerry.me
Start from 2012-01-16 20:09
Last edit at 2012-01-16 20:09
'''
import os
import urllib
from BeautifulSoup import BeautifulSoup

# Create the output directory ./voa (under the current working directory)
# once at import time; every article gets its own subdirectory inside it.
base_dir = os.path.join(os.getcwd(),'voa')
if not os.path.exists(base_dir):
    os.mkdir(base_dir)

def get_url_list(url):
    """Return the unique article URLs found on a 51voa listing page.

    Fetches *url*, decodes the body as GBK (ignoring undecodable bytes),
    and collects every anchor whose href starts with
    '/VOA_Standard_English/' and ends with 'html', prefixed with the
    site root.  Duplicates are removed via a set.
    """
    content = urllib.urlopen(url).read()
    soup = BeautifulSoup(content.decode('gbk', 'ignore'))
    url_list = []
    for anchor in soup.findAll('a'):
        # Some anchors carry no href attribute at all; .get() avoids the
        # KeyError that anchor['href'] would raise for those.
        href = anchor.get('href', '')
        if href.startswith('/VOA_Standard_English/') and href.endswith('html'):
            url_list.append('http://www.51voa.com' + href)
    return list(set(url_list))

def parse_page(url):
    content = urllib.urlopen(url).read().decode('gbk','ignore')
    temp = BeautifulSoup(content)
    title = '_'.join(str(temp.html.title).split()[4:-1])
    temp_date = str(temp.find("span", { "class" : "datetime" }))[23:-7]
    pub_date = '_'.join(temp_date.replace(',','').split())
    urls = temp.findAll('a')
    mp3_url = ''
    for i in urls:
        i = i['href']
        if i.startswith('/path.asp?url='):
            mp3_url = 'http://www.51voa.com'+i

    text = temp.find("div", "articleBody" ).findAll('p')
    mp3_dir = os.path.join(base_dir, title)
    filename = os.path.join(mp3_dir, mp3_url.split('/')[-1][:-4])

    if not os.path.exists(mp3_dir):
        os.mkdir(mp3_dir)

    if not os.path.exists(filename+'.txt'):
        f = open(filename+'.txt', 'w')
        for i in text:
            f.write(str(i)+'\n')
        f.close()
        print mp3_url, 'Text saved'

    if not os.path.exists(filename+'.mp3'):
        down_mp3(mp3_url, filename+'.mp3')

def down_mp3(mp3_url, filename):
    try:
        open(filename, 'wb').write(urllib.urlopen(mp3_url).read())
        print 'Downloaded'
    except:
        print mp3_url,' not downloaded'

def main():
    """Crawl the standard-English listing page and process every article."""
    listing = 'http://www.51voa.com/VOA_Standard_English/'
    for page_url in get_url_list(listing):
        parse_page(page_url)
    #break

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

之前的版本已经不能用了,经常有网友发邮件询问,就更新了下,2013年6月22日。

Codepython
#!/usr/bin/python
# -*- coding: utf-8 -*-
import _env
import os
from os.path import exists, join, dirname
import urllib
from lerrylib.Bot import bot, route, Page
from lerrylib.extract import extract, extract_all

# Download directory and the site/section being crawled.
mp3_dir = join(os.getcwd(), 'mp3')
url = 'http://www.51voa.com/VOA_Standard_English/'
# NOTE(review): url_base and proxies are defined but never referenced
# below — presumably proxies was meant to be passed to urllib.urlopen
# or the bot; confirm against lerrylib before relying on them.
url_base = 'http://www.51voa.com'
proxies = { "http": "113.11.199.27:8123", }

def down_mp3(mp3_url, filename):
    try:
        open(filename, 'wb').write(urllib.urlopen(mp3_url).read())
        print mp3_url, 'Downloaded'
    except:
        print mp3_url,' not downloaded'

# Handler for the listing page, dispatched by lerrylib's bot/route
# framework.  NOTE(review): as published this body cannot run — `title`
# is never assigned, and `extract_all('[')` is called with a single
# delimiter and no source text.  Presumably each loop iteration was
# meant to yield an article title/page; confirm against the lerrylib
# API before use.
@route('/VOA_Standard_English/')
class url_list(Page):
    def get(self):
        #print self.html
        li = self.extract('<p>', '</p>')  # NOTE(review): result never used
        for i in extract_all('['):  # NOTE(review): `i` unused; call looks incomplete
            the_root = join(mp3_dir, title)  # NOTE(review): `title` is undefined here
            if not exists(the_root):
                os.mkdir(the_root)
            # The mp3 link is embedded in a JavaScript Player("...") call.
            mp3 = 'http://down.51voa.com' + self.extract('Player("', '");')
            txt = self.extract('>  ', '')
            if txt:
                f = open(join(the_root, title+'.txt'), 'w')
                f.write(txt)
                f.close()
            down_mp3(mp3, join(the_root, title+'.mp3'))

if __name__ == '__main__':
    # Create the download directory BEFORE crawling: the original only
    # created it after bot.run() returned, so on a fresh machine every
    # down_mp3 call during the first crawl failed for lack of mp3_dir.
    if not exists(mp3_dir):
        os.mkdir(mp3_dir)
    bot.put(url)  # seed the crawl queue with the listing page
    bot.run(num=6, timeout=15)  # 6 workers, 15-second fetch timeout