Merge branch 'master' of https://daknuett.eu/gitea/daknuett/scientific-programming-exercises
commit 843615b0c0

exam/ex01/README.rst (new file, 47 lines)

@@ -0,0 +1,47 @@
Wikipedia Link Graph Analyzer
*****************************

.. contents::

Configuration
=============

Configuration is done in the file ``cfg.py``. There one can
specify whether the system should use a sqlite or a mysql
backend. Using the sqlite backend is faster for fetching the
data because sqlite omits implicit keys. However, when one
wants to analyze the data using SQL instead of the pure
Python implementation, mysql is faster.

It is recommended to use sqlite for fetching the data, then
to transfer it to a mysql database and to use that database
for the analysis.

The main options in ``cfg.py`` are whether to use mysql or
sqlite, plus the settings for each of those backends.
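
For orientation, a ``cfg.py`` might look as follows. This is a
sketch: the keys and values are taken from the ``cfg.py`` hunk in
the diff further down, and the ``use_sqlite`` flag mirrors the
``config["use_sqlite"]`` checks visible in the code.

.. code:: python

    config = {
          "use_sqlite": True
        , "mysql_user": "wikipedia"
        , "mysql_password": "wikipediastuff"
        , "mysql_database": "wikipedia_link_db"
        , "sqlite_cache_directory": "./cache/"
        , "sqlite_cache_name": "Angela_Merkel"
        }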

Invocation
==========

Before invoking the program one should make sure that the
`configuration`_ is correct: in particular, that the cache
directory and cache name are set correctly for sqlite and
that the mysql connection information is correct.

Then one must edit the name of the article to analyze and
the depth to which links should be received. After this is
done the link graph can be received (using ``python3
main.py``). One can specify the language by passing a
language abbreviation to ``receive_link_graph``.
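
The relevant lines in ``main.py`` look roughly like the following
sketch; the article name, depth and language are exactly the values
one would edit (compare the ``main.py`` hunk in the diff below).

.. code:: python

    # Open the cache database configured in cfg.py, then fetch the
    # link graph around the given article up to the given depth.
    cache = get_cache()
    receive_link_graph("Angela_Merkel", cache, 2, lang="en")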

It might be necessary to run this part several times if the
program was unable to fetch all links. One can check for
unreceived data by executing ``SELECT COUNT(*) FROM
failed_to_fetch``; the result should be 0.
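
This check can also be run from Python; a minimal sketch, assuming
the cache connection follows the DB-API cursor pattern used
elsewhere in the code:

.. code:: python

    # Count the pages whose links could not be fetched; re-run the
    # fetching step until this reaches zero.
    cursor = cache.cursor()
    cursor.execute("SELECT COUNT(*) FROM failed_to_fetch")
    print(cursor.fetchone()[0])  # should print 0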

Then the script uses Dijkstra's algorithm in breadth-first
mode to analyze the graph. By default this is done
in-memory; it is, however, possible to do it with SQL. Using
SQL is recommended only if the data exceeds the available
RAM, as it is considerably slower.
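
Because the link graph is unweighted, the in-memory analysis
amounts to a breadth-first search over the ``links`` table. The
following is a minimal sketch of the idea, not the project's actual
``DijkstraHelper``; the ``links(source, destination)`` schema is
inferred from the queries in the code.

.. code:: python

    from collections import deque

    def bfs_distances(connection, start_page):
        # Load the edge list into memory once.
        cursor = connection.cursor()
        cursor.execute("SELECT source, destination FROM links")
        graph = {}
        for source, destination in cursor:
            graph.setdefault(source, []).append(destination)

        # Breadth-first search: with unit edge weights this yields
        # the same distances as Dijkstra's algorithm.
        distances = {start_page: 0}
        queue = deque([start_page])
        while queue:
            page = queue.popleft()
            for neighbour in graph.get(page, []):
                if(neighbour not in distances):
                    distances[neighbour] = distances[page] + 1
                    queue.append(neighbour)
        return distances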
@@ -7,8 +7,10 @@ if(not config["use_sqlite"]):

 from proxy import fetch_proxies

-def get_cache(directory, name):
+def get_cache():
+    if(config["use_sqlite"]):
+        directory = config["sqlite_cache_directory"]
+        name = config["sqlite_cache_name"]
     cache_file = os.path.join(directory, "{}.sqlite".format(name))
     if(not os.path.exists(cache_file)):
         with open(cache_file, "w") as fin:
@@ -5,6 +5,8 @@ config = {
     , "mysql_user": "wikipedia"
     , "mysql_password": "wikipediastuff"
     , "mysql_database": "wikipedia_link_db"
+    , "sqlite_cache_directory": "./cache/"
+    , "sqlite_cache_name": "Angela_Merkel"
     }

 if(config["use_sqlite"]):
@@ -11,8 +11,8 @@ from connectivity import shortest_path
 from graph import DijkstraHelper
 from db_util import get_page_id, get_page_title

-cache = get_cache("./cache/", "Angela_Merkel")
-receive_link_graph("Angela_Merkel", cache, 2)
+cache = get_cache()
+receive_link_graph("Angela_Merkel", cache, 2, lang="en")

 cursor = cache.cursor()
 cursor.execute("SELECT COUNT(source) FROM links")
@@ -21,9 +21,9 @@ def ignore_title(title):
         return True
     return False

-def _receive_links(page, connection):
+def _receive_links(page, connection, lang="en"):
     title = get_page_title(page, connection)
-    url = construct_url(title)
+    url = construct_url(title, lang=lang)

     result = get_data_with_proxy(url, connection)
@@ -53,20 +53,20 @@ def _receive_links(page, connection):
         yield destination
     connection.commit()

-def receive_links(title, connection):
-    return list(_receive_links(title, connection))
+def receive_links(title, connection, lang="en"):
+    return list(_receive_links(title, connection, lang=lang))


-def receive_link_graph(title, connection, depth):
+def receive_link_graph(title, connection, depth, lang="en"):
     page = get_page_id(title, connection)
-    do_receive_link_graph(page, connection, depth, fetch_missing=True)
+    do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)

     cursor = connection.cursor()
     cursor.execute(sql.statements["count_failed_to_fetch"])
     if(cursor.fetchone()[0]):
-        do_receive_link_graph(page, connection, depth, fetch_missing=True)
+        do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)

-def do_receive_link_graph(page, connection, depth, fetch_missing=False):
+def do_receive_link_graph(page, connection, depth, fetch_missing=False, lang="en"):
     if(depth < 0):
         # end of recursion
         return
@@ -78,7 +78,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):
         delete_cursor = connection.cursor()
         cursor.execute(sql.statements["get_failed_to_fetch"])
         for d, p in cursor:
-            do_receive_link_graph(p, connection, d, fetch_missing=False)
+            do_receive_link_graph(p, connection, d, fetch_missing=False, lang=lang)
             delete_cursor.execute(sql.statements["delete_failed_to_fetch"], (p,))
@@ -93,7 +93,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):

     for link in _receive_links(page, connection):
         try:
-            do_receive_link_graph(link, connection, depth - 1)
+            do_receive_link_graph(link, connection, depth - 1, lang=lang)
         except NoMoreProxiesException as e:
             logger.exception("All proxies are blocked")
             # Wikipedia blocked all our proxies.