import logging
from collections import deque

from cfg import config
from url import construct_url
from proxy import get_data_with_proxy, NoMoreProxiesException
from db_util import get_page_id, get_page_title

logger = logging.getLogger(__name__)

# Title prefixes of pages that are never recorded as link destinations.
_IGNORED_TITLE_PREFIXES = (
    "Help:",
    "Wikipedia:",
    "Template:",
    "Template_talk:",
    "Category:",
)

_INSERT_LINK_SQL = "INSERT INTO links(source, destination) VALUES(%s, %s)"


def ignore_title(title):
    """Return True if *title* starts with one of the ignored prefixes."""
    return title.startswith(_IGNORED_TITLE_PREFIXES)


def _store_link(cursor, source, destination_title, connection):
    """Insert the link source -> destination_title and return the destination id."""
    destination = get_page_id(destination_title, connection)
    cursor.execute(_INSERT_LINK_SQL, (source, destination))
    return destination


def _receive_links(page, connection):
    """Fetch the links of *page*, store them, and yield each destination id.

    This is a generator: every ``links`` row is inserted right before the
    corresponding destination id is yielded, and the transaction is only
    committed once the generator has been consumed completely.

    Args:
        page: Id of the source page (resolved to a title via
            ``get_page_title``).
        connection: Database connection, also handed to
            ``get_data_with_proxy``.

    Yields:
        The page id of every stored link destination.
    """
    title = get_page_title(page, connection)
    url = construct_url(title)
    result = get_data_with_proxy(url, connection)

    # The result is keyed by a page id we do not know, so walk all entries.
    for page_data in result["query"]["pages"].values():
        cursor = connection.cursor()

        if "links" not in page_data:
            # No link list: the entry's own title is the only candidate.
            destination_title = page_data["title"].replace(" ", "_")
            # Avoid 1-loops (a page linking to itself) and ignored pages.
            if destination_title == title or ignore_title(destination_title):
                continue
            yield _store_link(cursor, page, destination_title, connection)
            continue

        for link_data in page_data["links"]:
            destination_title = link_data["title"].replace(" ", "_")
            if ignore_title(destination_title):
                continue
            yield _store_link(cursor, page, destination_title, connection)

    connection.commit()


def receive_links(title, connection):
    """Return the destination ids produced by ``_receive_links`` as a list.

    NOTE(review): despite its name, *title* is forwarded unchanged as the
    ``page`` argument of ``_receive_links``, which treats it as a page id.
    Confirm against callers.
    """
    return list(_receive_links(title, connection))


def receive_link_graph(title, connection, depth):
    """Crawl the link graph starting at the page called *title*.

    Runs one crawl with retries of previously failed pages enabled. If the
    ``failed_to_fetch`` table still contains rows afterwards, one more pass
    is made.

    Args:
        title: Title of the start page.
        connection: Database connection.
        depth: Maximum number of link hops to follow from the start page.
    """
    page = get_page_id(title, connection)
    do_receive_link_graph(page, connection, depth, fetch_missing=True)

    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(page) FROM failed_to_fetch")
    if cursor.fetchone()[0]:
        do_receive_link_graph(page, connection, depth, fetch_missing=True)


def _retry_failed_fetches(connection):
    """Re-crawl every page recorded in ``failed_to_fetch`` and remove it."""
    cursor = connection.cursor()
    cursor.execute(
        "SELECT failed_to_fetch.depth, failed_to_fetch.page FROM failed_to_fetch"
    )
    # Read all rows up front: the recursion below runs further statements
    # and commits on the same connection, which must not interfere with a
    # result set that is still being iterated.
    pending = cursor.fetchall()
    if not pending:
        return

    delete_cursor = connection.cursor()
    for pending_depth, pending_page in pending:
        do_receive_link_graph(
            pending_page, connection, pending_depth, fetch_missing=False
        )
        delete_cursor.execute(
            "DELETE FROM failed_to_fetch WHERE page=%s", (pending_page,)
        )
    # Make the removals durable even if the caller returns early afterwards.
    connection.commit()


def do_receive_link_graph(page, connection, depth, fetch_missing=False):
    """Recursively fetch and store the links reachable from *page*.

    Args:
        page: Id of the page to crawl.
        connection: Database connection.
        depth: Remaining number of link hops; a negative value ends the
            recursion.
        fetch_missing: If true, pages recorded in ``failed_to_fetch`` are
            retried before *page* itself is processed.
    """
    if depth < 0:
        # End of recursion.
        return

    logger.info("do_receive_link_graph(%d, , %d)", page, depth)

    if fetch_missing:
        _retry_failed_fetches(connection)

    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(source) FROM links WHERE source=%s", (page,))
    if cursor.fetchone()[0] != 0:
        # Links of this page are already stored; nothing to do.
        return

    logger.info("fetching links for %s", page)

    for link in _receive_links(page, connection):
        try:
            do_receive_link_graph(link, connection, depth - 1)
        except NoMoreProxiesException:
            logger.exception("All proxies are blocked")
            # Wikipedia blocked all our proxies. Remember the page and its
            # remaining depth so that it can be retried later.
            cursor = connection.cursor()
            cursor.execute(
                "INSERT INTO failed_to_fetch(page, depth) VALUES(%s, %s)",
                (link, depth - 1),
            )
            connection.commit()