diff --git a/exam/ex01/README.rst b/exam/ex01/README.rst
new file mode 100644
index 0000000..76d5e77
--- /dev/null
+++ b/exam/ex01/README.rst
@@ -0,0 +1,47 @@
+Wikipedia Link Graph Analyzer
+*****************************
+
+.. contents::
+
+Configuration
+=============
+
+Configuration is done in the file ``cfg.py``. There one can
+specify whether the system should use a sqlite or a mysql
+backend. Using the sqlite backend is faster for fetching the
+data because sqlite omits implicit keys. However, when one
+wants to analyze the data using SQL instead of the pure
+Python implementation, mysql is faster.
+
+It is recommended to use sqlite for fetching the data, then
+to transfer it to a mysql database and to use that database
+for the analysis.
+
+The main options in ``cfg.py`` are whether to use mysql or
+sqlite, plus the options for those systems.
+
+Invocation
+==========
+
+Before invoking the program one should make sure that the
+`configuration`_ is correct, in particular that the cache
+directory and cache name are set correctly for sqlite and
+that the mysql connection information is correct.
+
+Then one must edit, in ``main.py``, the name of the article
+to analyze and the depth to which links are fetched. After
+this is done the link graph can be fetched (using
+``python3 main.py``). One can specify the language to use by
+passing a language abbreviation to ``receive_link_graph``.
+
+It might be necessary to run this part several times if the
+program was unable to fetch all links. One can check for
+unfetched data by executing ``SELECT COUNT(*) FROM
+failed_to_fetch``. The result should be 0.
+
+Then the script uses Dijkstra's algorithm in breadth-first
+mode to analyze the graph. By default this is done
+in-memory; it is, however, possible to do it with SQL. Using
+SQL is recommended only if the data exceeds the RAM, as it
+is much slower.
+
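As a reference for the configuration section above, a minimal sketch of what the backend selection in ``cfg.py`` looks like, assembled from the keys visible in this patch; the ``use_sqlite`` default shown here is an assumption, and the real file may contain further keys::

    config = {
        "use_sqlite": True                        # False selects the mysql backend
        , "mysql_user": "wikipedia"
        , "mysql_password": "wikipediastuff"
        , "mysql_database": "wikipedia_link_db"
        , "sqlite_cache_directory": "./cache/"    # directory holding the sqlite file
        , "sqlite_cache_name": "Angela_Merkel"    # file name of the sqlite cache
    }
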
diff --git a/exam/ex01/cache.py b/exam/ex01/cache.py
index 8bad71b..6e81f6c 100644
--- a/exam/ex01/cache.py
+++ b/exam/ex01/cache.py
@@ -7,8 +7,10 @@
 if(not config["use_sqlite"]):
     from proxy import fetch_proxies
 
-def get_cache(directory, name):
+def get_cache():
     if(config["use_sqlite"]):
+        directory = config["sqlite_cache_directory"]
+        name = config["sqlite_cache_name"]
         cache_file = os.path.join(directory, "{}.sqlite".format(name))
         if(not os.path.exists(cache_file)):
             with open(cache_file, "w") as fin:
diff --git a/exam/ex01/cfg.py b/exam/ex01/cfg.py
index 15a65db..3a8565d 100644
--- a/exam/ex01/cfg.py
+++ b/exam/ex01/cfg.py
@@ -5,6 +5,8 @@ config = {
     , "mysql_user": "wikipedia"
     , "mysql_password": "wikipediastuff"
     , "mysql_database": "wikipedia_link_db"
+    , "sqlite_cache_directory": "./cache/"
+    , "sqlite_cache_name": "Angela_Merkel"
 }
 
 if(config["use_sqlite"]):
diff --git a/exam/ex01/main.py b/exam/ex01/main.py
index fb6b8ad..7c5efe6 100644
--- a/exam/ex01/main.py
+++ b/exam/ex01/main.py
@@ -11,8 +11,8 @@ from connectivity import shortest_path
 from graph import DijkstraHelper
 from db_util import get_page_id, get_page_title
 
-cache = get_cache("./cache/", "Angela_Merkel")
-receive_link_graph("Angela_Merkel", cache, 2)
+cache = get_cache()
+receive_link_graph("Angela_Merkel", cache, 2, lang="en")
 
 cursor = cache.cursor()
 cursor.execute("SELECT COUNT(source) FROM links")
diff --git a/exam/ex01/receive.py b/exam/ex01/receive.py
index 84ee232..babc962 100644
--- a/exam/ex01/receive.py
+++ b/exam/ex01/receive.py
@@ -21,9 +21,9 @@ def ignore_title(title):
             return True
     return False
 
-def _receive_links(page, connection):
+def _receive_links(page, connection, lang="en"):
     title = get_page_title(page, connection)
-    url = construct_url(title)
+    url = construct_url(title, lang=lang)
 
     result = get_data_with_proxy(url, connection)
 
@@ -53,20 +53,20 @@ def _receive_links(page, connection):
             yield destination
     connection.commit()
 
-def receive_links(title, connection):
-    return list(_receive_links(title, connection))
+def receive_links(title, connection, lang="en"):
+    return list(_receive_links(title, connection, lang=lang))
 
-def receive_link_graph(title, connection, depth):
+def receive_link_graph(title, connection, depth, lang="en"):
     page = get_page_id(title, connection)
-    do_receive_link_graph(page, connection, depth, fetch_missing=True)
+    do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)
 
     cursor = connection.cursor()
     cursor.execute(sql.statements["count_failed_to_fetch"])
     if(cursor.fetchone()[0]):
-        do_receive_link_graph(page, connection, depth, fetch_missing=True)
+        do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)
 
-def do_receive_link_graph(page, connection, depth, fetch_missing=False):
+def do_receive_link_graph(page, connection, depth, fetch_missing=False, lang="en"):
     if(depth < 0):
         # end of recursion
         return
@@ -78,7 +78,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):
         delete_cursor = connection.cursor()
         cursor.execute(sql.statements["get_failed_to_fetch"])
         for d, p in cursor:
-            do_receive_link_graph(p, connection, d, fetch_missing=False)
+            do_receive_link_graph(p, connection, d, fetch_missing=False, lang=lang)
 
             delete_cursor.execute(sql.statements["delete_failed_to_fetch"], (p,))
 
@@ -93,7 +93,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):
 
     for link in _receive_links(page, connection):
         try:
-            do_receive_link_graph(link, connection, depth - 1)
+            do_receive_link_graph(link, connection, depth - 1, lang=lang)
         except NoMoreProxiesException as e:
             logger.exception("All proxies are blocked")
             # Wikipedia blocked all our proxies.
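For completeness, a short usage sketch of the interface after this change, mirroring ``main.py``; it assumes ``get_cache`` and ``receive_link_graph`` are imported from ``cache.py`` and ``receive.py``, as their definitions suggest, and the ``failed_to_fetch`` check is the one recommended in the README::

    from cache import get_cache
    from receive import receive_link_graph

    # Backend, cache directory and cache name now come from cfg.py
    # instead of being passed to get_cache() directly.
    cache = get_cache()

    # Fetch the link graph around one article up to the given depth,
    # selecting the Wikipedia edition via a language abbreviation.
    receive_link_graph("Angela_Merkel", cache, 2, lang="en")

    # Repeat the fetch until no pages are left unfetched.
    cursor = cache.cursor()
    cursor.execute("SELECT COUNT(*) FROM failed_to_fetch")
    print(cursor.fetchone()[0])  # should be 0 before analyzing the graph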