import logging

from url import construct_url
from proxy import get_data_with_proxy, NoMoreProxiesException

logger = logging.getLogger(__name__)


def _receive_links(title, connection):
    url = construct_url(title)
    result = get_data_with_proxy(url, connection)

    # Iterate over all returned pages, because we do not know the page ID in advance.
    for k, page_data in result["query"]["pages"].items():
        cursor = connection.cursor()
        if "links" not in page_data:
            destination_title = page_data["title"].replace(" ", "_")
            # Avoid 1-loops (a page linking to itself).
            if destination_title == title:
                continue
            cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)",
                           (title, destination_title))
            yield destination_title
        else:
            for destination in page_data["links"]:
                destination_title = destination["title"].replace(" ", "_")
                cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)",
                               (title, destination_title))
                yield destination_title
    connection.commit()


def receive_links(title, connection):
    return list(_receive_links(title, connection))


def receive_link_graph(title, connection, depth):
    if depth < 0:
        # End of recursion.
        return

    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
    if cursor.fetchone()[0] != 0:
        # We have already fetched this title.
        return

    logger.info("fetching links for {}".format(title))

    for link in _receive_links(title, connection):
        try:
            receive_link_graph(link, connection, depth - 1)
        except NoMoreProxiesException:
            logger.exception("All proxies are blocked")
            # Wikipedia blocked all our proxies.
            # Store the title that still has to be fetched, so it can be retried later.
            cursor = connection.cursor()
            cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)",
                           (link, depth - 1))
            connection.commit()
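
# A minimal usage sketch, not part of the module above: it assumes a sqlite3
# connection (the "?" placeholders suggest the sqlite3 paramstyle) and creates
# the two tables the functions write to. The column types are illustrative
# guesses based on the INSERT statements; adjust them to the real schema.
if __name__ == "__main__":
    import sqlite3

    logging.basicConfig(level=logging.INFO)

    connection = sqlite3.connect("links.db")
    cursor = connection.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS links("
                   "source TEXT, destination TEXT)")
    cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch("
                   "title TEXT, depth INTEGER)")
    connection.commit()

    # Crawl the link graph two levels deep, starting from one article title.
    receive_link_graph("Python_(programming_language)", connection, 2)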