# checklinks.py
import html.parser
import json
import os
import urllib.error
import urllib.parse
import urllib.request

# Checks for broken links, preemptively saving all linked pages
# to the Internet Archive's Wayback Machine.
# Requires Python 3.

self_dir = os.path.dirname(__file__)
html_path = os.path.join(self_dir, "index.html")

def extract_links_from_html_file(path):
    # Collect the href attribute of every <a> tag in the document.
    class LinkExtractParser(html.parser.HTMLParser):
        def __init__(self):
            super().__init__()
            self.links = []

        def handle_starttag(self, tag, attrs):
            if tag == "a":
                for (k, v) in attrs:
                    if k == "href":
                        self.links.append(v)

    parser = LinkExtractParser()
    with open(path) as file:
        parser.feed(file.read())

    def is_http_link(href):
        return href.startswith("https:") or href.startswith("http:")

    # Drop duplicates, filter out non-HTTP links, sort.
    links = list(filter(is_http_link, sorted(set(parser.links))))
    return links
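
# Illustrative usage (the file name and returned URLs here are made-up examples,
# not output from this repository):
#   extract_links_from_html_file("index.html")
#   -> ["http://example.com/a", "https://example.org/b"]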

def try_fetch_webpage(url):
    # Returns {"success": True}, {"redirect": <final URL>}, or {"error": <reason>}.
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            return {"error": str(e.code)}
        elif hasattr(e, "reason"):
            return {"error": str(e.reason)}
        # Fallback so callers always receive a dict.
        return {"error": str(e)}
    else:
        if response.geturl() != url:
            return {"redirect": response.geturl()}
        else:
            return {"success": True}
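
# Illustrative return values (the URLs are placeholders):
#   try_fetch_webpage("http://example.com/")       -> {"success": True}
#   try_fetch_webpage("http://example.com/moved")  -> {"redirect": "https://example.com/new"}
#   try_fetch_webpage("http://example.com/gone")   -> {"error": "404"}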

def get_wayback_machine_archived_page(url):
    # Ask the Wayback Machine's availability API for the closest snapshot.
    # Percent-encode the URL so query strings in it do not break the request.
    api_url = "https://archive.org/wayback/available?url=" + urllib.parse.quote(url, safe="")
    response = urllib.request.urlopen(api_url)
    data = json.loads(response.read())
    if "closest" in data["archived_snapshots"]:
        if data["archived_snapshots"]["closest"]["available"]:
            return data["archived_snapshots"]["closest"]["url"]
    return None
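
# The availability API answers with JSON along these lines (values are
# illustrative, following the documented response shape):
#   {"archived_snapshots": {"closest": {
#       "available": true,
#       "status": "200",
#       "timestamp": "20130919044612",
#       "url": "http://web.archive.org/web/20130919044612/http://example.com/"}}}
# or {"archived_snapshots": {}} when no snapshot exists.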

def save_in_wayback_machine(url):
    # Hit the "Save Page Now" endpoint; on success the response carries a
    # Content-Location header with the path of the new snapshot.
    api_url = "https://web.archive.org/save/" + url
    response = urllib.request.urlopen(api_url)
    location = response.info().get("Content-Location", "")
    if location.startswith("/web/"):
        return "https://web.archive.org" + location
    return None
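
# For example, a successful save might answer with a header such as
#   Content-Location: /web/20240101000000/http://example.com/
# (timestamp and URL are placeholders), which the function turns into a full
# web.archive.org link.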

urls = extract_links_from_html_file(html_path)
for url in urls:
    result = try_fetch_webpage(url)
    if "success" in result:
        print("OK {}".format(url))
        # Archive pages that are alive now but not yet in the Wayback Machine.
        if get_wayback_machine_archived_page(url) is None:
            try:
                wayback_url = save_in_wayback_machine(url)
                print(" Archived as: {}".format(wayback_url))
            except urllib.error.URLError:
                pass
    elif "redirect" in result:
        print("REDIRECT {}".format(url))
        print(" Redirects to: {}".format(result["redirect"]))
    elif "error" in result:
        print("BROKEN {}".format(url))
        print(" Reason: {}".format(result["error"]))
    if "success" not in result:
        # For broken or redirected links, point at an archived copy if one exists.
        try:
            wayback_url = get_wayback_machine_archived_page(url)
            if wayback_url is not None:
                print(" Archived copy: {}".format(wayback_url))
        except urllib.error.URLError:
            pass
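
# A run over a page with one live, one moved, and one dead link might print
# something like this (URLs and timestamps invented for illustration):
#   OK https://example.com/alive
#   REDIRECT http://example.com/old
#    Redirects to: https://example.com/new
#    Archived copy: http://web.archive.org/web/20240101000000/http://example.com/old
#   BROKEN http://example.com/dead
#    Reason: 404
#    Archived copy: http://web.archive.org/web/20240101000000/http://example.com/dead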