import logging

from url import construct_url
from proxy import get_data_with_proxy

logger = logging.getLogger(__name__)


def _receive_links(title, connection):
    """Fetch the outgoing links of one page, record them, and yield them."""
    url = construct_url(title)

    result = get_data_with_proxy(url, connection)

    # The API keys the result by page ID, which we do not know in advance,
    # so iterate over all returned pages.
    for page_data in result["query"]["pages"].values():
        cursor = connection.cursor()
if(not "links" in page_data):
|
||
|
destination_title = page_data["title"].replace(" ", "_")
|
||
|
# avoid 1-loops
|
||
|
if(destination_title == title):
|
||
|
pass
|
||
|
cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
|
||
|
yield destination_title
|
||
|
|
||
|
else:
|
||
|
for destination in page_data["links"]:
|
||
|
destination_title = destination["title"].replace(" ", "_")
|
||
|
cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
|
||
|
yield destination_title
|
||
|
connection.commit()
|
||
|


def receive_links(title, connection):
    """Eagerly collect the links of one page into a list."""
    return list(_receive_links(title, connection))


def receive_link_graph(title, connection, depth):
    """Recursively fetch links starting at title, up to the given depth."""
    if depth < 0:
        # End of recursion.
        return

    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
    if cursor.fetchone()[0] != 0:
        # We fetched that title already.
        return

    logger.info("fetching links for %s", title)

    for link in _receive_links(title, connection):
        receive_link_graph(link, connection, depth - 1)
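
# A minimal usage sketch, not part of the original module: it assumes a
# sqlite3 database with a links(source, destination) table matching the
# INSERT and SELECT statements above, and a seed title chosen only for
# illustration.
if __name__ == "__main__":
    import sqlite3

    logging.basicConfig(level=logging.INFO)

    connection = sqlite3.connect("links.db")
    connection.execute(
        "CREATE TABLE IF NOT EXISTS links(source TEXT, destination TEXT)"
    )
    # Crawl two levels of links starting from the seed page.
    receive_link_graph("Python_(programming_language)", connection, 2)
    connection.close()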