scientific-programming-exer.../exam/ex01/receive.py

import logging
from collections import deque

from cfg import config
from url import construct_url
from proxy import get_data_with_proxy, NoMoreProxiesException
from db_util import get_page_id, get_page_title
import sql

logger = logging.getLogger(__name__)

def ignore_title(title):
	"""
	Tell whether ``title`` begins with one of the skipped prefixes.

	Returns ``True`` if ``title`` starts with any of the prefixes listed
	below, ``False`` otherwise. The comparison is case-sensitive.
	"""
	skipped_prefixes = (
		"Help:",
		"Wikipedia:",
		"Template:",
		"Template_talk:",
		"Category:",
	)
	# str.startswith accepts a tuple and checks every alternative in one call.
	return title.startswith(skipped_prefixes)

def _receive_links(page, connection, lang="en"):
	"""
	Generator that stores the links of ``page`` and yields their destinations.

	``page`` is turned into a title with ``get_page_title``; the URL built by
	``construct_url`` is then requested through ``get_data_with_proxy``.
	For each entry under ``result["query"]["pages"]``:

	* if the entry has a ``"links"`` list, every linked title that is not
	  rejected by ``ignore_title`` is resolved with ``get_page_id``, inserted
	  with the ``insert_link`` statement and yielded;
	* otherwise the entry's own title is used as the single destination,
	  unless it equals the requested title or is rejected by
	  ``ignore_title``.

	The commit for an entry only runs once the generator is resumed past the
	last destination it yielded for that entry.
	"""
	title = get_page_title(page, connection)
	response = get_data_with_proxy(construct_url(title, lang=lang), connection)

	# We don't know the page ID the entries are stored under, so walk them all.
	for entry in response["query"]["pages"].values():
		cursor = connection.cursor()

		if "links" in entry:
			for link in entry["links"]:
				link_title = link["title"].replace(" ", "_")
				if not ignore_title(link_title):
					target = get_page_id(link_title, connection)
					cursor.execute(sql.statements["insert_link"], (page, target))
					yield target
		else:
			own_title = entry["title"].replace(" ", "_")
			# Avoid 1-loops and ignored titles; nothing was written for this
			# entry, so move on to the next one without committing.
			if own_title == title or ignore_title(own_title):
				continue
			target = get_page_id(own_title, connection)
			cursor.execute(sql.statements["insert_link"], (page, target))
			yield target

		connection.commit()

def receive_links(title, connection, lang="en"):
	"""
	Run ``_receive_links`` to completion and return its results as a list.

	NOTE(review): ``title`` is forwarded unchanged as the ``page`` argument of
	``_receive_links``, which hands it to ``get_page_title`` - so it looks
	like a page id is expected here rather than a title; confirm with callers.
	"""
	link_iterator = _receive_links(title, connection, lang=lang)
	return list(link_iterator)


def receive_link_graph(title, connection, depth, lang="en"):
	"""
	Fetch the link graph starting at ``title`` down to ``depth``.

	The title is resolved with ``get_page_id`` and handed to
	``do_receive_link_graph`` with ``fetch_missing=True``. If the
	``count_failed_to_fetch`` statement afterwards reports a non-zero value,
	the same call is made one more time.
	"""
	root_page = get_page_id(title, connection)
	do_receive_link_graph(root_page, connection, depth, fetch_missing=True, lang=lang)

	counter = connection.cursor()
	counter.execute(sql.statements["count_failed_to_fetch"])
	still_failed = counter.fetchone()[0]
	if not still_failed:
		return

	# Some fetches failed during the first pass: run a second pass.
	do_receive_link_graph(root_page, connection, depth, fetch_missing=True, lang=lang)

def do_receive_link_graph(page, connection, depth, fetch_missing=False, lang="en"):
	"""
	Recursively fetch and store the links reachable from ``page``.

	Parameters:
		page: id of the page whose links are fetched.
		connection: database connection used for every statement.
		depth: remaining recursion depth; a negative value ends the recursion.
		fetch_missing: if true, the entries returned by the
			``get_failed_to_fetch`` statement are re-processed (and removed
			with ``delete_failed_to_fetch``) before ``page`` is handled.
		lang: language passed to ``_receive_links`` and to recursive calls.

	Returns ``None``. Pages for which ``count_links_from`` is non-zero are
	treated as already fetched and skipped.

	A ``NoMoreProxiesException`` raised while descending into a link is
	logged and the link is recorded with ``insert_failed_to_fetch``.
	Exceptions raised while fetching the links of ``page`` itself are not
	caught here.
	"""
	if depth < 0:
		# end of recursion
		return
	logger.info("do_receive_link_graph(%d, <connection>, %d)", page, depth)

	# Fetch the missing links.
	if fetch_missing:
		select_cursor = connection.cursor()
		delete_cursor = connection.cursor()
		select_cursor.execute(sql.statements["get_failed_to_fetch"])
		# Read all rows up front: the loop body runs further statements,
		# deletes and commits on this same connection, which is not safe to
		# interleave with a cursor that is still being iterated.
		pending = select_cursor.fetchall()
		for failed_depth, failed_page in pending:
			do_receive_link_graph(failed_page, connection, failed_depth, fetch_missing=False, lang=lang)
			delete_cursor.execute(sql.statements["delete_failed_to_fetch"], (failed_page,))

	cursor = connection.cursor()
	cursor.execute(sql.statements["count_links_from"], (page,))
	if cursor.fetchone()[0] != 0:
		# we fetched that title already
		return

	logger.info("fetching links for %s", page)

	# Forward ``lang`` so the links are fetched for the requested language
	# instead of always falling back to the default.
	for link in _receive_links(page, connection, lang=lang):
		try:
			do_receive_link_graph(link, connection, depth - 1, lang=lang)
		except NoMoreProxiesException:
			logger.exception("All proxies are blocked")
			# Wikipedia blocked all our proxies.
			# Retry later, so we have to store our list that is still to fetch.
			# NOTE(review): parameters are passed as (page, depth) here while
			# rows of get_failed_to_fetch are unpacked as (depth, page) above;
			# confirm both match the column order in sql.statements.
			cursor = connection.cursor()
			cursor.execute(sql.statements["insert_failed_to_fetch"], (link, depth - 1))
			connection.commit()