scientific-programming-exer.../exam/ex01/receive.py

import logging
from url import construct_url
from proxy import get_data_with_proxy, NoMoreProxiesException

logger = logging.getLogger(__name__)

def ignore_title(title):
	"""
	Tell whether ``title`` starts with one of the excluded prefixes.

	Returns True if ``title`` begins with any of the prefixes listed
	below and False otherwise. The comparison is case sensitive.
	"""
	excluded_prefixes = (
		"Help:",
		"Wikipedia:",
		"Template:",
		"Template_talk:",
		"Category:",
	)
	return any(title.startswith(prefix) for prefix in excluded_prefixes)

def _receive_links(title, connection):
	"""
	Fetch the links of the page ``title``, store them and yield them.

	The URL built by construct_url(title) is fetched through
	get_data_with_proxy(). For every page in the result one row
	(source=title, destination=<linked title>) is inserted into the
	``links`` table per link and the linked title is yielded. Spaces in
	titles are replaced by underscores.

	A page without a "links" entry is stored as a single link to the
	page's own reported title, unless that title equals ``title``.

	If ignore_title(title) is true nothing is stored and nothing is
	yielded.

	This is a generator: the inserts and the commit only happen while it
	is being iterated. Exceptions raised by the fetch or by the database
	are passed on to the caller.
	"""
	url = construct_url(title)

	result = get_data_with_proxy(url, connection)

	# ignore_title() only looks at the source title, so it is evaluated
	# once here instead of once per link.
	# NOTE(review): the filter is applied to the *source* title and not to
	# the link target -- confirm that this is the intended behaviour.
	skip_source = ignore_title(title)

	# This is basically because we don't know the page ID.
	for page_data in result["query"]["pages"].values():
		cursor = connection.cursor()
		try:
			if "links" not in page_data:
				destination_title = page_data["title"].replace(" ", "_")
				# avoid 1-loops
				if destination_title == title or skip_source:
					continue
				cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))
				yield destination_title

			elif not skip_source:
				for destination in page_data["links"]:
					destination_title = destination["title"].replace(" ", "_")
					try:
						cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))
					except Exception:
						# Record which title could not be stored, then let
						# the original exception propagate unchanged.
						logger.error("failed to store link to %r", destination_title)
						raise
					yield destination_title
			connection.commit()
		finally:
			# Release the cursor even if an insert fails or the consumer
			# stops iterating early.
			cursor.close()

def receive_links(title, connection):
	"""
	Run _receive_links() to completion and return everything it yields
	as a list.
	"""
	return [*_receive_links(title, connection)]


def receive_link_graph(title, connection, depth):
	"""
	Recursively fetch and store the link graph starting at ``title``.

	The links of ``title`` are fetched via _receive_links() and every
	yielded link is followed with ``depth - 1``. The recursion stops when
	``depth`` drops below zero. A title that already occurs as ``source``
	in the ``links`` table is treated as fetched and skipped.

	If following a link raises NoMoreProxiesException, that link and its
	remaining depth are stored in the ``failed_to_fetch`` table and the
	loop carries on with the next link. Exceptions raised while fetching
	the links of ``title`` itself are not caught here.

	Returns None.
	"""
	if depth < 0:
		# end of recursion
		return

	# progress output on stdout
	print(repr(title))

	cursor = connection.cursor()
	try:
		cursor.execute("SELECT COUNT(source) FROM links WHERE source=%s", (title,))
		already_fetched = cursor.fetchone()[0] != 0
	finally:
		cursor.close()

	if already_fetched:
		# we fetched that title already
		return

	logger.info("fetching links for %s", title)

	for link in _receive_links(title, connection):
		try:
			receive_link_graph(link, connection, depth - 1)
		except NoMoreProxiesException:
			logger.exception("All proxies are blocked")
			# Wikipedia blocked all our proxies.
			# Retry later, so we have to store our list that is still to fetch.

			retry_cursor = connection.cursor()
			try:
				# DB-API "format" paramstyle only knows %s placeholders,
				# also for integers; %d is rejected by the driver.
				retry_cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(%s, %s)", (link, depth - 1))
				connection.commit()
			finally:
				retry_cursor.close()
initial stuff for ex01 2019-02-02 10:18:57 +00:00			`import logging`
			`from url import construct_url`
fetching the list works 2019-02-02 15:06:57 +00:00			`from proxy import get_data_with_proxy, NoMoreProxiesException`
initial stuff for ex01 2019-02-02 10:18:57 +00:00
			`logger = logging.getLogger(__name__)`

basically mysql is working 2019-02-15 11:46:32 +00:00			`def ignore_title(title):`
			`ignore_starters = ["Help:"`
			`, "Wikipedia:"`
			`, "Template:"`
			`, "Template_talk:"`
			`, "Category:"`
			`]`
			`for ignore in ignore_starters:`
			`if(title.startswith(ignore)):`
			`return True`
			`return False`

initial stuff for ex01 2019-02-02 10:18:57 +00:00			`def _receive_links(title, connection):`
			`url = construct_url(title)`

			`result = get_data_with_proxy(url, connection)`
			`# This is basically because we don't know the page ID.`
			`for k, page_data in result["query"]["pages"].items():`
			`cursor = connection.cursor()`
			`if(not "links" in page_data):`
			`destination_title = page_data["title"].replace(" ", "_")`
			`# avoid 1-loops`
			`if(destination_title == title):`
basically mysql is working 2019-02-15 11:46:32 +00:00			`continue`
			`if(ignore_title(title)):`
			`continue`
			`cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))`
initial stuff for ex01 2019-02-02 10:18:57 +00:00			`yield destination_title`

			`else:`
			`for destination in page_data["links"]:`
basically mysql is working 2019-02-15 11:46:32 +00:00			`if(ignore_title(title)):`
			`continue`
initial stuff for ex01 2019-02-02 10:18:57 +00:00			`destination_title = destination["title"].replace(" ", "_")`
basically mysql is working 2019-02-15 11:46:32 +00:00			`try:`
			`cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))`
			`except Exception as e:`
			`print(destination_title)`
			`raise e`
initial stuff for ex01 2019-02-02 10:18:57 +00:00			`yield destination_title`
			`connection.commit()`

			`def receive_links(title, connection):`
			`return list(_receive_links(title, connection))`


			`def receive_link_graph(title, connection, depth):`
			`if(depth < 0):`
			`# end of recursion`
			`return`

			`cursor = connection.cursor()`
basically mysql is working 2019-02-15 11:46:32 +00:00			`print(repr(title))`
			`cursor.execute("SELECT COUNT(source) FROM links WHERE source=%s", (title,))`
initial stuff for ex01 2019-02-02 10:18:57 +00:00			`if(cursor.fetchone()[0] != 0):`
			`# we fetched that title already`
			`return`

			`logger.info("fetching links for {}".format(title))`

			`for link in _receive_links(title, connection):`
fetching the list works 2019-02-02 15:06:57 +00:00			`try:`
			`receive_link_graph(link, connection, depth - 1)`
			`except NoMoreProxiesException as e:`
			`logger.exception("All proxies are blocked")`
			`# Wikipedia blocked all our proxies.`
			`# Retry later, so we have to store our list that is still to fetch.`

			`cursor = connection.cursor()`
basically mysql is working 2019-02-15 11:46:32 +00:00			`cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(%s, %d)", (link, depth - 1))`
fetching the list works 2019-02-02 15:06:57 +00:00			`connection.commit()`

initial stuff for ex01 2019-02-02 10:18:57 +00:00