finished language support

2019-02-25 14:43:15 +01:00
parent bbbfd9eb57
commit e7b8b25528
5 changed files with 19 additions and 14 deletions
@@ -31,6 +31,8 @@ the mysql connection information is correct.
 Then one must edit the name of the article to center the
 analysis on and the depth up to which links are followed. After
 this is done the link graph can be received (using ``python3 main.py``).
+One can specify the language by passing a language
+abbreviation (e.g. ``lang="en"``) to ``receive_link_graph``.

 It might be necessary to run this part several times if the
 program was unable to fetch all links. One can check for
@@ -43,4 +45,3 @@ in-memory, it is however possible to do it with SQL. Using
 SQL is recommended only if the data exceeds the RAM, as it
 is much slower.

-
@@ -7,8 +7,10 @@ if(not config["use_sqlite"]):

 from proxy import fetch_proxies

-def get_cache(directory, name):
+def get_cache():
 	if(config["use_sqlite"]):
+		directory = config["sqlite_cache_directory"]
+		name = config["sqlite_cache_name"]
 		cache_file = os.path.join(directory, "{}.sqlite".format(name))
 		if(not os.path.exists(cache_file)):
 			with open(cache_file, "w") as fin:
@@ -5,6 +5,8 @@ config = {
 	, "mysql_user": "wikipedia"
 	, "mysql_password": "wikipediastuff"
 	, "mysql_database": "wikipedia_link_db"
+	, "sqlite_cache_directory": "./cache/"
+	, "sqlite_cache_name": "Angela_Merkel"
 }

 if(config["use_sqlite"]):
@@ -11,8 +11,8 @@ from connectivity import shortest_path
 from graph import DijkstraHelper
 from db_util import get_page_id

-cache = get_cache("./cache/", "Angela_Merkel")
-receive_link_graph("Angela_Merkel", cache, 2)
+cache = get_cache()
+receive_link_graph("Angela_Merkel", cache, 2, lang="en")

 cursor = cache.cursor()
 cursor.execute("SELECT COUNT(source) FROM links")
@@ -21,9 +21,9 @@ def ignore_title(title):
 			return True
 	return False

-def _receive_links(page, connection):
+def _receive_links(page, connection, lang="en"):
 	title = get_page_title(page, connection)
-	url = construct_url(title)
+	url = construct_url(title, lang=lang)

 	result = get_data_with_proxy(url, connection)

@@ -53,20 +53,20 @@ def _receive_links(page, connection):
 				yield destination
 		connection.commit()

-def receive_links(title, connection):
-	return list(_receive_links(title, connection))
+def receive_links(title, connection, lang="en"):
+	return list(_receive_links(title, connection, lang=lang))


-def receive_link_graph(title, connection, depth):
+def receive_link_graph(title, connection, depth, lang="en"):
 	page = get_page_id(title, connection)
-	do_receive_link_graph(page, connection, depth, fetch_missing=True)
+	do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)

 	cursor = connection.cursor()
 	cursor.execute(sql.statements["count_failed_to_fetch"])
 	if(cursor.fetchone()[0]):
-		do_receive_link_graph(page, connection, depth, fetch_missing=True)
+		do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)

-def do_receive_link_graph(page, connection, depth, fetch_missing=False):
+def do_receive_link_graph(page, connection, depth, fetch_missing=False, lang="en"):
 	if(depth < 0):
 		# end of recursion
 		return
@@ -78,7 +78,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):
 		delete_cursor = connection.cursor()
 		cursor.execute(sql.statements["get_failed_to_fetch"])
 		for d, p in cursor:
-			do_receive_link_graph(p, connection, d, fetch_missing=False)
+			do_receive_link_graph(p, connection, d, fetch_missing=False, lang=lang)
 			delete_cursor.execute(sql.statements["delete_failed_to_fetch"], (p,))


@@ -93,7 +93,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):

 	for link in _receive_links(page, connection):
 		try:
-			do_receive_link_graph(link, connection, depth - 1)
+			do_receive_link_graph(link, connection, depth - 1, lang=lang)
 		except NoMoreProxiesException as e:
 			logger.exception("All proxies are blocked")
 			# Wikipedia blocked all our proxies.