From bbbfd9eb5765a5ea6e7716a90d98ba2d9ab3b216 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Kn=C3=BCttel?=
Date: Mon, 25 Feb 2019 14:42:00 +0100
Subject: [PATCH 1/2] added README

---
 exam/ex01/README.rst | 46 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 exam/ex01/README.rst

diff --git a/exam/ex01/README.rst b/exam/ex01/README.rst
new file mode 100644
index 0000000..4968f9a
--- /dev/null
+++ b/exam/ex01/README.rst
@@ -0,0 +1,46 @@
+Wikipedia Link Graph Analyzer
+*****************************
+
+.. contents::
+
+Configuration
+=============
+
+Configuration is done in the file ``cfg.py``. There one can
+specify whether the system should use an SQLite or a MySQL
+backend. The SQLite backend is faster for fetching the
+data because SQLite omits implicit keys. However, when one
+wants to analyze the data using SQL instead of the pure
+Python implementation, MySQL is faster.
+
+It is recommended to use SQLite for fetching the data, then
+to transfer it to a MySQL database and to use that database
+for the analysis.
+
+The main options in ``cfg.py`` select between MySQL and
+SQLite and configure the chosen backend.
+
+Invocation
+==========
+
+Before invoking the program one should make sure that the
+`configuration`_ is correct, in particular that the cache
+directory and cache name are set correctly for SQLite and
+that the MySQL connection information is correct.
+
+Then one must edit the name of the article to analyze and
+the depth to which links should be fetched. After this is
+done the link graph can be fetched using ``python3 main.py``.
+
+It might be necessary to run this part several times if the
+program was unable to fetch all links. One can check for
+missing data by executing ``SELECT COUNT(*) FROM
+failed_to_fetch``. The result should be 0.
+
+Then the script uses Dijkstra's algorithm in breadth-first
+mode to analyze the graph. By default this is done in
+memory; it is, however, possible to do it with SQL. Using
+SQL is recommended only if the data does not fit into RAM,
+as it is considerably slower.
+
+

From e7b8b255280fc453114ad82ad5c337174203a1b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Kn=C3=BCttel?=
Date: Mon, 25 Feb 2019 14:43:15 +0100
Subject: [PATCH 2/2] finished language support

---
 exam/ex01/README.rst |  3 ++-
 exam/ex01/cache.py   |  4 +++-
 exam/ex01/cfg.py     |  2 ++
 exam/ex01/main.py    |  4 ++--
 exam/ex01/receive.py | 22 +++++++++++-----------
 5 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/exam/ex01/README.rst b/exam/ex01/README.rst
index 4968f9a..76d5e77 100644
--- a/exam/ex01/README.rst
+++ b/exam/ex01/README.rst
@@ -31,6 +31,8 @@ that the MySQL connection information is correct.
 Then one must edit the name of the article to analyze and
 the depth to which links should be fetched. After this is
 done the link graph can be fetched using ``python3 main.py``.
+One can specify the language to use by passing a language
+abbreviation to ``receive_link_graph``.
 
 It might be necessary to run this part several times if the
 program was unable to fetch all links. One can check for
@@ -43,4 +45,3 @@ memory; it is, however, possible to do it with SQL. Using
 SQL is recommended only if the data does not fit into RAM,
 as it is considerably slower.
 
-
diff --git a/exam/ex01/cache.py b/exam/ex01/cache.py
index 8bad71b..6e81f6c 100644
--- a/exam/ex01/cache.py
+++ b/exam/ex01/cache.py
@@ -7,8 +7,10 @@ if(not config["use_sqlite"]):
     from proxy import fetch_proxies
 
 
-def get_cache(directory, name):
+def get_cache():
     if(config["use_sqlite"]):
+        directory = config["sqlite_cache_directory"]
+        name = config["sqlite_cache_name"]
         cache_file = os.path.join(directory, "{}.sqlite".format(name))
         if(not os.path.exists(cache_file)):
             with open(cache_file, "w") as fin:
diff --git a/exam/ex01/cfg.py b/exam/ex01/cfg.py
index 891f03d..904133e 100644
--- a/exam/ex01/cfg.py
+++ b/exam/ex01/cfg.py
@@ -5,6 +5,8 @@ config = {
         , "mysql_user": "wikipedia"
         , "mysql_password": "wikipediastuff"
         , "mysql_database": "wikipedia_link_db"
+        , "sqlite_cache_directory": "./cache/"
+        , "sqlite_cache_name": "Angela_Merkel"
         }
 
 if(config["use_sqlite"]):
diff --git a/exam/ex01/main.py b/exam/ex01/main.py
index 4cacf25..1657d98 100644
--- a/exam/ex01/main.py
+++ b/exam/ex01/main.py
@@ -11,8 +11,8 @@ from connectivity import shortest_path
 from graph import DijkstraHelper
 from db_util import get_page_id
 
-cache = get_cache("./cache/", "Angela_Merkel")
-receive_link_graph("Angela_Merkel", cache, 2)
+cache = get_cache()
+receive_link_graph("Angela_Merkel", cache, 2, lang="en")
 
 cursor = cache.cursor()
 cursor.execute("SELECT COUNT(source) FROM links")
diff --git a/exam/ex01/receive.py b/exam/ex01/receive.py
index 84ee232..babc962 100644
--- a/exam/ex01/receive.py
+++ b/exam/ex01/receive.py
@@ -21,9 +21,9 @@ def ignore_title(title):
             return True
     return False
 
-def _receive_links(page, connection):
+def _receive_links(page, connection, lang="en"):
     title = get_page_title(page, connection)
-    url = construct_url(title)
+    url = construct_url(title, lang=lang)
 
     result = get_data_with_proxy(url, connection)
 
@@ -53,20 +53,20 @@ def _receive_links(page, connection):
             yield destination
     connection.commit()
 
-def receive_links(title, connection):
-    return list(_receive_links(title, connection))
+def receive_links(title, connection, lang="en"):
+    return list(_receive_links(title, connection, lang=lang))
 
-def receive_link_graph(title, connection, depth):
+def receive_link_graph(title, connection, depth, lang="en"):
     page = get_page_id(title, connection)
-    do_receive_link_graph(page, connection, depth, fetch_missing=True)
+    do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)
 
     cursor = connection.cursor()
     cursor.execute(sql.statements["count_failed_to_fetch"])
     if(cursor.fetchone()[0]):
-        do_receive_link_graph(page, connection, depth, fetch_missing=True)
+        do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)
 
 
-def do_receive_link_graph(page, connection, depth, fetch_missing=False):
+def do_receive_link_graph(page, connection, depth, fetch_missing=False, lang="en"):
     if(depth < 0):
         # end of recursion
         return
@@ -78,7 +78,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):
         delete_cursor = connection.cursor()
         cursor.execute(sql.statements["get_failed_to_fetch"])
         for d, p in cursor:
-            do_receive_link_graph(p, connection, d, fetch_missing=False)
+            do_receive_link_graph(p, connection, d, fetch_missing=False, lang=lang)
 
             delete_cursor.execute(sql.statements["delete_failed_to_fetch"], (p,))
 
@@ -93,7 +93,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):
 
-    for link in _receive_links(page, connection):
+    for link in _receive_links(page, connection, lang=lang):
         try:
-            do_receive_link_graph(link, connection, depth - 1)
+            do_receive_link_graph(link, connection, depth - 1, lang=lang)
         except NoMoreProxiesException as e:
             logger.exception("All proxies are blocked")
             # Wikipedia blocked all our proxies.
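
As a usage sketch of the API after these two patches: ``get_cache()`` now
reads the cache location from ``cfg.py``, and ``receive_link_graph`` accepts
a ``lang`` keyword (a Wikipedia language abbreviation). The retry loop around
the ``failed_to_fetch`` check mirrors the README's advice that fetching may
need several passes. This is illustrative only, not part of the patches: the
import paths assume the files touched above, and the article name, depth, and
``lang="de"`` are example values::

    # Illustrative driver: fetch the link graph for one article in a chosen
    # language and re-run the fetch until failed_to_fetch is empty.
    from cache import get_cache
    from receive import receive_link_graph

    cache = get_cache()  # cache location and backend come from cfg.py
    receive_link_graph("Angela_Merkel", cache, 2, lang="de")

    # The README says this count should be 0 before the analysis starts;
    # re-run the fetch while it is not.
    cursor = cache.cursor()
    cursor.execute("SELECT COUNT(*) FROM failed_to_fetch")
    while cursor.fetchone()[0]:
        receive_link_graph("Angela_Merkel", cache, 2, lang="de")
        cursor.execute("SELECT COUNT(*) FROM failed_to_fetch")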