Podcast Scraping: Radiolab

Radiolab seems to build their pages with JavaScript. That made it slightly more annoying to find the right download link, but otherwise it's straightforward.

import time
import requests
from bs4 import BeautifulSoup
import os.path
def download(href, title, extension="mp3", dirname='.'):
    """Stream the file at ``href`` to disk and return the local path.

    The file is saved as ``<dirname>/<title>.<extension>``. Any "/" in the
    resulting filename is replaced with "-" so that it cannot be mistaken
    for a directory separator. ``dirname`` is created if it does not exist.

    Args:
        href: URL of the file to download.
        title: Base name for the saved file (without extension).
        extension: File extension to append, without the leading dot.
        dirname: Directory to save into.

    Returns:
        The path of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Nothing is written to disk in that case.
    """
    print(href, title, extension, dirname)
    filename = ("%s.%s" % (title, extension)).replace("/", "-")
    # todo, path management
    if not os.path.exists(dirname):
        print("making dir %s" % dirname)
        # exist_ok guards against the directory appearing between the
        # exists() check above and this call.
        os.makedirs(dirname, exist_ok=True)
    local_filename = os.path.join(dirname, filename)
    # Using the response as a context manager releases the connection even
    # if the download is interrupted part-way through.
    with requests.get(href, stream=True) as r:
        # Fail before opening the output file so an error page is never
        # saved under an audio filename.
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    return local_filename
# Walk the Radiolab archive listing, open each linked episode page, and
# download the episode audio into a per-season directory.
archive_page = requests.get("http://www.radiolab.org/archive")
archive_page.raise_for_status()
a_soup = BeautifulSoup(archive_page.content, 'html.parser')
for ep_row in a_soup.find_all('div', attrs={"class": "info-overlay"}):
    ep_page_link = ep_row.find('a', attrs={"class": "read-more"})
    if ep_page_link is None or not ep_page_link.get('href'):
        # One malformed archive entry should not abort the whole run.
        print("skipping archive entry without an episode link")
        continue
    ep_url = ep_page_link['href']
    print (ep_url)
    ep_page = requests.get(ep_url)
    if not ep_page.ok:
        print("skipping %s: HTTP %s" % (ep_url, ep_page.status_code))
        continue
    soup = BeautifulSoup(ep_page.content, 'html.parser')
    link = soup.find('div', attrs={"class": "player_element"})
    title = soup.find('h2', attrs={"class": "title"})
    meta = soup.find('div', attrs={"class": "seanum-epnum"})
    if link is None or title is None or meta is None:
        print("skipping %s: player, title or season/episode info not found" % ep_url)
        continue
    href = link.get('data-download')
    if not href:
        print("skipping %s: no download link on the player element" % ep_url)
        continue
    # The code splits this text on "|" and strips the words "Season" and
    # "Episode" from the two halves.
    season, separator, episode = meta.text.partition("|")
    if not separator:
        print("skipping %s: unexpected season/episode text %r" % (ep_url, meta.text))
        continue
    season = season.replace("Season", "").strip()
    episode = episode.replace("Episode", "").strip()
    # Use the heading's text, not the Tag itself: formatting a Tag with %s
    # would put the surrounding <h2> markup into the filename.
    title_text = title.get_text(strip=True)
    download(href, "%s. %s" % (episode, title_text), dirname="Season_%s" % season)

27th January 2018

Comments and Messages

I won't ever give out your email address. I don't publish comments but if you'd like to write to me then you could use this form.