diff --git a/Unit6/general_scraper.py b/Unit6/general_scraper.py
new file mode 100755
index 0000000..d06f3a1
--- /dev/null
+++ b/Unit6/general_scraper.py
@@ -0,0 +1,42 @@
+import urllib.request
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup
+import os
+
+base_url = "https://apod.nasa.gov/apod/archivepix.html"
+scraper_test_url = "http://web.mit.edu/jesstess/www/scraper_test/"
+download_directory = "apod_pictures"
+
+# Create the destination directory up front; otherwise the very first
+# urlretrieve call fails with FileNotFoundError.
+os.makedirs(download_directory, exist_ok=True)
+
+to_visit = set((base_url,))
+visited = set()
+
+while to_visit:
+    # Pick a link to visit
+    current_page = to_visit.pop()
+    # Visit the link
+    print("visiting: ", current_page)
+    visited.add(current_page)
+    content = urllib.request.urlopen(current_page).read()
+    # Parse once and reuse the soup for both the link and image scans.
+    soup = BeautifulSoup(content, "lxml")
+    # Extract any new links from that page
+    for link in soup.findAll("a"):
+        # Anchors without an href attribute would raise KeyError with link["href"].
+        href = link.get("href")
+        if href is None:
+            continue
+        absolute_link = urljoin(current_page, href)
+        if absolute_link not in visited:
+            to_visit.add(absolute_link)
+        else:
+            print("Already visited:", absolute_link)
+    # Download any images on the page
+    for img in soup.findAll("img"):
+        img_href = urljoin(current_page, img["src"])
+        print("Downloading image:", img_href)
+        img_name = img_href.split("/")[-1]
+        urllib.request.urlretrieve(img_href, os.path.join(download_directory, img_name))
diff --git a/Unit6/scraper.py b/Unit6/scraper.py
new file mode 100755
index 0000000..c383d96
--- /dev/null
+++ b/Unit6/scraper.py
@@ -0,0 +1,43 @@
+# From the archive, follow each link, find an image in that page, download the image

+
+# Concepts:
+# 1. Downloading stuff => urllib
+# 2. Parsing stuff out of HTML => BeautifulSoup
+
+# Download the index page
+import urllib.request
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup
+import os
+
+base_url = "https://apod.nasa.gov/apod/archivepix.html"
+download_directory = "apod_pictures"
+content = urllib.request.urlopen(base_url).read()
+
+# input() (the original called an undefined read()) returns a string, so
+# convert to int for the numeric comparison against img_count below.
+image_count = int(input("How many images to download?\n> "))
+img_count = 0
+
+# Create the destination directory up front so urlretrieve cannot fail.
+os.makedirs(download_directory, exist_ok=True)
+
+# For each link on the Index page:
+for link in BeautifulSoup(content, "lxml").findAll("a"):
+    print("Following link:", link)
+    href = urljoin(base_url, link["href"])
+    # Follow the link and pull down the image on that linked page
+    page = urllib.request.urlopen(href).read()
+    for img in BeautifulSoup(page, "lxml").findAll("img"):
+        img_href = urljoin(href, img["src"])
+        print("Downloading image:", img_href)
+        img_name = img_href.split("/")[-1]
+        urllib.request.urlretrieve(img_href, os.path.join(download_directory, img_name))
+        img_count += 1
+        # Stop once the requested number of images has been fetched; the
+        # original printed this message but never broke out of the loops.
+        if img_count == image_count:
+            print("Finished Downloading " + str(img_count) + " images.")
+            break
+    if img_count == image_count:
+        break