scientific-programming-exer.../exam/ex01/receive.py

import logging
from url import construct_url
from proxy import get_data_with_proxy, NoMoreProxiesException

logger = logging.getLogger(__name__)

def _receive_links(title, connection):
	"""
	Fetch the links of the page ``title`` and store them in the ``links`` table.

	This is a generator: the request is only sent once iteration starts, and
	each row is inserted right before its destination title is yielded.
	The inserts for one page are committed after all of its links have been
	yielded, so a consumer that stops iterating early leaves them uncommitted.

	:param title: Title of the source page, with underscores instead of spaces.
	:param connection: Database connection providing ``cursor()`` and
		``commit()``; it is also handed to ``get_data_with_proxy``.
	:returns: Yields the destination titles (spaces replaced by underscores).

	Any exception raised by ``get_data_with_proxy`` propagates to the caller
	on the first iteration step.
	"""
	url = construct_url(title)

	result = get_data_with_proxy(url, connection)
	# The pages are keyed by page ID, which we do not know in advance,
	# so simply walk over every page in the response.
	for page_data in result["query"]["pages"].values():
		cursor = connection.cursor()
		if "links" not in page_data:
			# The page reports no links: fall back to the title the response
			# gives for the page itself.
			destination_title = page_data["title"].replace(" ", "_")
			# avoid 1-loops: never store or yield a page as a link to itself.
			# (The original used ``pass`` here, which skipped nothing.)
			if destination_title == title:
				continue
			cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
			yield destination_title

		else:
			for destination in page_data["links"]:
				destination_title = destination["title"].replace(" ", "_")
				cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
				yield destination_title
		connection.commit()

def receive_links(title, connection):
	"""
	Eagerly run ``_receive_links`` for ``title`` and return its results.

	:param title: Title of the source page.
	:param connection: Database connection passed through to ``_receive_links``.
	:returns: A list with every destination title the generator yielded.
	"""
	destinations = []
	# Exhausting the generator here is what triggers its inserts and commits.
	destinations.extend(_receive_links(title, connection))
	return destinations


def receive_link_graph(title, connection, depth):
	"""
	Recursively fetch and store the link graph reachable from ``title``.

	:param title: Title of the page to start from.
	:param connection: Database connection providing ``cursor()`` and
		``commit()``; must contain the ``links`` and ``failed_to_fetch`` tables.
	:param depth: Remaining recursion depth; a negative value stops immediately.
	:returns: ``None``.

	A title that already appears as ``source`` in ``links`` is skipped.
	If following a link raises ``NoMoreProxiesException``, that link is
	written to ``failed_to_fetch`` together with its depth so it can be
	retried later, and the loop carries on with the next link.
	"""
	# end of recursion
	if depth < 0:
		return

	lookup = connection.cursor()
	lookup.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
	stored_rows = lookup.fetchone()[0]
	if stored_rows != 0:
		# we fetched that title already
		return

	logger.info("fetching links for {}".format(title))

	child_depth = depth - 1
	for link in _receive_links(title, connection):
		try:
			receive_link_graph(link, connection, child_depth)
		except NoMoreProxiesException:
			# Wikipedia blocked all our proxies.
			# Remember what is still to fetch so it can be retried later.
			logger.exception("All proxies are blocked")

			retry_cursor = connection.cursor()
			retry_cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, child_depth))
			connection.commit()