2019-02-02 10:18:57 +00:00
|
|
|
import logging
|
2019-02-25 12:07:52 +00:00
|
|
|
from collections import deque
|
|
|
|
|
|
|
|
from cfg import config
|
2019-02-02 10:18:57 +00:00
|
|
|
from url import construct_url
|
2019-02-02 15:06:57 +00:00
|
|
|
from proxy import get_data_with_proxy, NoMoreProxiesException
|
2019-02-25 12:07:52 +00:00
|
|
|
from db_util import get_page_id, get_page_title
|
2019-02-02 10:18:57 +00:00
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
2019-02-15 11:46:32 +00:00
|
|
|
def ignore_title(title):
    """Tell whether a page title should be skipped.

    A title is skipped when it starts with one of the prefixes ``Help:``,
    ``Wikipedia:``, ``Template:``, ``Template_talk:`` or ``Category:``.
    The comparison is case-sensitive and only looks at the start of the
    title.

    :param title: page title to check.
    :return: ``True`` if the title starts with an ignored prefix,
        ``False`` otherwise.
    """
    ignore_starters = (
        "Help:",
        "Wikipedia:",
        "Template:",
        "Template_talk:",
        "Category:",
    )
    # str.startswith accepts a tuple of prefixes, so no explicit loop is
    # needed to test them all.
    return title.startswith(ignore_starters)
|
|
|
|
|
2019-02-25 12:07:52 +00:00
|
|
|
def _link_target_titles(page_data, title):
    """Yield the destination titles described by one entry of the response.

    Spaces in titles are replaced by underscores.

    :param page_data: one value of ``result["query"]["pages"]``.
    :param title: title of the source page, used to suppress self-links
        in the fallback case.
    """
    if "links" in page_data:
        for link in page_data["links"]:
            yield link["title"].replace(" ", "_")
        return

    # The entry lists no links: its own title is the only candidate.
    own_title = page_data["title"].replace(" ", "_")
    # avoid 1-loops
    if own_title != title:
        yield own_title


def _receive_links(page, connection):
    """Fetch the links of *page*, store them and yield their page IDs.

    The title of *page* is looked up with ``get_page_title``, turned into a
    URL with ``construct_url`` and fetched with ``get_data_with_proxy``.
    Every destination title that is not rejected by ``ignore_title`` is
    translated with ``get_page_id``, inserted into the ``links`` table as
    ``(page, destination)`` and then yielded.

    This is a generator: nothing is fetched before the first item is
    requested, and ``connection.commit()`` is only called once the
    generator has been consumed completely.

    :param page: ID of the source page.
    :param connection: database connection, also handed to the helpers.
    :return: generator over the destination page IDs.
    """
    title = get_page_title(page, connection)
    url = construct_url(title)

    result = get_data_with_proxy(url, connection)

    # We walk every entry of "pages" because we don't know the page ID
    # the response is keyed by.
    for page_data in result["query"]["pages"].values():
        cursor = connection.cursor()
        for destination_title in _link_target_titles(page_data, title):
            if ignore_title(destination_title):
                continue
            destination = get_page_id(destination_title, connection)
            cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (page, destination))
            yield destination
    connection.commit()
|
|
|
|
|
|
|
|
def receive_links(title, connection):
    """Return the destination page IDs linked from the page named *title*.

    :param title: title of the source page.
    :param connection: database connection, handed on to the helpers.
    :return: list of the values produced by ``_receive_links``.
    """
    # _receive_links takes a page ID (it calls get_page_title on its first
    # argument and stores it as links.source), so the title has to be
    # resolved first -- the same way receive_link_graph does it.
    page = get_page_id(title, connection)
    return list(_receive_links(page, connection))
|
|
|
|
|
|
|
|
|
2019-02-25 12:07:52 +00:00
|
|
|
def receive_link_graph(title, connection, depth):
    """Crawl the link graph that starts at the page named *title*.

    The title is resolved to a page ID and handed to
    ``do_receive_link_graph`` with ``fetch_missing=True``. If the
    ``failed_to_fetch`` table still holds rows afterwards, the same call is
    issued exactly one more time.

    :param title: title of the start page.
    :param connection: database connection.
    :param depth: recursion depth passed to ``do_receive_link_graph``.
    """
    root_page = get_page_id(title, connection)
    do_receive_link_graph(root_page, connection, depth, fetch_missing=True)

    # Anything left over in failed_to_fetch?
    count_cursor = connection.cursor()
    count_cursor.execute("SELECT COUNT(page) FROM failed_to_fetch")
    still_pending = count_cursor.fetchone()[0]
    if not still_pending:
        return

    # One more pass to retry what is still recorded as failed.
    do_receive_link_graph(root_page, connection, depth, fetch_missing=True)
|
|
|
|
|
|
|
|
def do_receive_link_graph(page, connection, depth, fetch_missing=False):
    """Recursively fetch and store the links reachable from *page*.

    Steps:

    1. Return immediately when *depth* is negative.
    2. If *fetch_missing* is true, retry every row of ``failed_to_fetch``
       with its stored depth and delete the row afterwards.
    3. Return if the ``links`` table already has rows with *page* as
       source.
    4. Otherwise iterate ``_receive_links(page, connection)`` and recurse
       into every yielded link with ``depth - 1``.

    ``NoMoreProxiesException`` raised by the recursive call for a linked
    page is logged and the link is recorded in ``failed_to_fetch`` together
    with its depth. An exception raised while the links of *page* itself
    are being produced is not caught here and propagates to the caller.

    :param page: ID of the page to process.
    :param connection: database connection.
    :param depth: remaining recursion depth.
    :param fetch_missing: retry the pages stored in ``failed_to_fetch``
        before processing *page*.
    """
    if depth < 0:
        # end of recursion
        return
    # Pass the arguments to the logger instead of formatting eagerly.
    logger.info("do_receive_link_graph(%d, <connection>, %d)", page, depth)

    # Fetch the missing links.
    if fetch_missing:
        select_cursor = connection.cursor()
        delete_cursor = connection.cursor()
        select_cursor.execute('''SELECT failed_to_fetch.depth, failed_to_fetch.page
                        FROM failed_to_fetch
                        ''')
        # Take a snapshot of the pending rows: failed_to_fetch is modified
        # while we work through them (rows are deleted below and the
        # recursive calls may insert new ones).
        for d, p in select_cursor.fetchall():
            do_receive_link_graph(p, connection, d, fetch_missing=False)
            delete_cursor.execute("DELETE FROM failed_to_fetch WHERE page=%s", (p,))
            # Commit the removal like every other write in this module;
            # otherwise it is only persisted if a later statement happens
            # to commit.
            connection.commit()

    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(source) FROM links WHERE source=%s", (page,))
    if cursor.fetchone()[0] != 0:
        # we fetched that title already
        return

    logger.info("fetching links for {}".format(page))

    for link in _receive_links(page, connection):
        try:
            do_receive_link_graph(link, connection, depth - 1)
        except NoMoreProxiesException:
            logger.exception("All proxies are blocked")
            # Wikipedia blocked all our proxies.
            # Retry later, so we have to store our list that is still to fetch.
            cursor = connection.cursor()
            cursor.execute("INSERT INTO failed_to_fetch(page, depth) VALUES(%s, %s)", (link, depth - 1))
            connection.commit()
|
|
|
|
|
2019-02-02 10:18:57 +00:00
|
|
|
|
|
|
|
|