scientific-programming-exer.../exam/ex01/receive.py

97 lines
2.8 KiB
Python
Raw Permalink Normal View History

2019-02-02 10:18:57 +00:00
import logging
from url import construct_url
2019-02-02 15:06:57 +00:00
from proxy import get_data_with_proxy, NoMoreProxiesException
2019-02-19 13:16:22 +00:00
from db_util import get_page_id
2019-02-02 10:18:57 +00:00
logger = logging.getLogger(__name__)
2019-02-15 11:46:32 +00:00
def ignore_title(title):
    """Return True if *title* lies in a Wikipedia namespace we do not crawl.

    Matches article titles beginning with any of the administrative /
    meta namespaces (Help:, Wikipedia:, Template:, Template_talk:,
    Category:).
    """
    # str.startswith accepts a tuple of prefixes — one C-level call
    # instead of a Python loop over the candidates.
    skipped_namespaces = (
        "Help:",
        "Wikipedia:",
        "Template:",
        "Template_talk:",
        "Category:",
    )
    return title.startswith(skipped_namespaces)
2019-02-02 10:18:57 +00:00
def _receive_links(title, connection):
    """Fetch the outgoing wiki links of *title*, store them, and yield them.

    Queries the MediaWiki API (through a proxy), inserts one row per edge
    into the ``links`` table and yields each destination title (spaces
    normalized to underscores).  Commits once at the end.

    Parameters:
        title:      source page title (underscore form).
        connection: open DB connection (also passed to the helpers).

    Yields:
        Destination page titles linked from *title*.
    """
    url = construct_url(title)

    source = get_page_id(title, connection)

    result = get_data_with_proxy(url, connection)

    # One cursor for all inserts — no need to allocate one per API page.
    cursor = connection.cursor()
    # The API response is keyed by page ID, which we do not know in advance,
    # so iterate over every entry.
    for _page_id, page_data in result["query"]["pages"].items():
        if "links" not in page_data:
            destination_title = page_data["title"].replace(" ", "_")
            # avoid 1-loops
            if destination_title == title:
                continue
            # BUG FIX: filter on the *destination* title; the original
            # checked ignore_title(title) (the constant source page),
            # so the namespace filter never filtered anything.
            if ignore_title(destination_title):
                continue
            destination = get_page_id(destination_title, connection)
            cursor.execute(
                "INSERT INTO links(source, destination) VALUES(%s, %s)",
                (source, destination),
            )
            yield destination_title
        else:
            for link in page_data["links"]:
                # Compute the destination title first, then filter on it
                # (the original tested the source title before even
                # looking at the link).
                destination_title = link["title"].replace(" ", "_")
                if ignore_title(destination_title):
                    continue
                destination = get_page_id(destination_title, connection)
                cursor.execute(
                    "INSERT INTO links(source, destination) VALUES(%s, %s)",
                    (source, destination),
                )
                yield destination_title
    connection.commit()
def receive_links(title, connection):
    """Eagerly collect every destination title linked from *title*.

    Thin wrapper around the generator ``_receive_links`` that drains it
    into a list, so all DB inserts happen before returning.
    """
    collected = [destination for destination in _receive_links(title, connection)]
    return collected
2019-02-19 13:16:22 +00:00
def receive_link_graph(title, connection, depth, fetch_missing=True):
    """Recursively crawl the Wikipedia link graph starting at *title*.

    Parameters:
        title:         start page title (underscore form).
        connection:    open DB connection.
        depth:         remaining recursion depth; negative means stop.
        fetch_missing: when True, first retry pages recorded in
                       ``failed_to_fetch`` by a previous run that ran out
                       of proxies (retries use fetch_missing=False to
                       avoid re-reading that table recursively).
    """
    if fetch_missing:
        fetch_cursor = connection.cursor()
        delete_cursor = connection.cursor()
        fetch_cursor.execute('''SELECT pages.title, failed_to_fetch.depth, failed_to_fetch.page
                FROM failed_to_fetch
                LEFT JOIN pages ON pages.page_id=failed_to_fetch.page''')
        # BUG FIX: materialize the result set before recursing.  The
        # recursive calls issue further queries on the same connection,
        # which can invalidate a lazily-iterated cursor on some drivers
        # (e.g. server-side/unbuffered MySQL cursors).
        for t, d, p in fetch_cursor.fetchall():
            receive_link_graph(t, connection, d, fetch_missing=False)
            delete_cursor.execute("DELETE FROM failed_to_fetch WHERE page=%s", (p,))

    if depth < 0:
        # end of recursion
        return

    page = get_page_id(title, connection)

    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(source) FROM links WHERE source=%s", (page,))
    if cursor.fetchone()[0] != 0:
        # we fetched that title already
        return
    # Lazy %-formatting: the message is only built if INFO is enabled.
    logger.info("fetching links for %s", title)

    for link in _receive_links(title, connection):
        try:
            receive_link_graph(link, connection, depth - 1)
        except NoMoreProxiesException:
            logger.exception("All proxies are blocked")
            # Wikipedia blocked all our proxies.
            # Retry later, so we have to store our list that is still to fetch.
            # Use a dedicated cursor instead of shadowing the outer one.
            retry_cursor = connection.cursor()
            failed_page = get_page_id(link, connection)
            retry_cursor.execute(
                "INSERT INTO failed_to_fetch(page, depth) VALUES(%s, %s)",
                (failed_page, depth - 1),
            )
            connection.commit()
2019-02-02 10:18:57 +00:00