Merge branch 'master' of https://daknuett.eu/gitea/daknuett/scientific-programming-exercises

commit 843615b0c0

exam/ex01/README.rst (new file, 47 lines)

@@ -0,0 +1,47 @@
Wikipedia Link Graph Analyzer
*****************************

.. contents::

Configuration
=============

Configuration is done in the file ``cfg.py``. There one can
specify whether the system should use an SQLite or a MySQL
backend. Using the SQLite backend is faster for fetching the
data, because SQLite omits implicit keys. However, when one
wants to analyze the data using SQL instead of the pure
Python implementation, MySQL is faster.

It is recommended to use SQLite for fetching the data, then
to transfer it to a MySQL database and use that database
for the analysis.
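
Such a transfer could look like the following minimal sketch.
It is an illustration only: the MySQL driver, the target schema
(which must already exist) and the ``links(source, destination)``
layout are assumptions; only the ``source`` column of ``links``
is visible in the diffs below.

.. code:: python

    import sqlite3
    import MySQLdb  # assumption: any MySQL driver would do

    src = sqlite3.connect("./cache/Angela_Merkel.sqlite")
    dst = MySQLdb.connect(user="wikipedia",
                          passwd="wikipediastuff",
                          db="wikipedia_link_db")

    cur = dst.cursor()
    # Copy the link table row by row; further tables (e.g.
    # failed_to_fetch) would be transferred the same way.
    for source, destination in src.execute("SELECT source, destination FROM links"):
        cur.execute("INSERT INTO links (source, destination) VALUES (%s, %s)",
                    (source, destination))
    dst.commit()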

The main options in ``cfg.py`` are whether to use MySQL or
SQLite, together with the settings for those two systems.
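
A sketch of what ``cfg.py`` might look like with the SQLite
backend enabled; the ``use_sqlite`` flag and all keys below
appear in the diffs further down, while the chosen values are
only examples:

.. code:: python

    config = {
          "use_sqlite": True                     # switch between the two backends
        , "mysql_user": "wikipedia"
        , "mysql_password": "wikipediastuff"
        , "mysql_database": "wikipedia_link_db"
        , "sqlite_cache_directory": "./cache/"   # where the .sqlite file lives
        , "sqlite_cache_name": "Angela_Merkel"   # file name without the extension
    }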

Invocation
==========

Before invoking the program one should make sure that the
`configuration`_ is correct, in particular that the cache
directory and cache name are set correctly for SQLite and
that the MySQL connection information is correct.

Then one must edit the name of the article to analyze and the
depth to which links are received. After this is done the
link graph can be received (using ``python3 main.py``); the
relevant lines are sketched below. One can specify the
language to use via a language abbreviation in
``receive_link_graph``.
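
These are the lines of ``main.py`` (shown in the diff below)
that one edits; the article name, the depth ``2`` and
``lang="en"`` are the values to change:

.. code:: python

    cache = get_cache()
    receive_link_graph("Angela_Merkel", cache, 2, lang="en")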

It might be necessary to run this part several times if the
program was unable to fetch all links. One can check for
unreceived data by executing ``SELECT COUNT(*) FROM
failed_to_fetch``; the result should be 0.
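
A minimal sketch of this check for the SQLite backend (the
cache path is assumed from the default settings above):

.. code:: python

    import sqlite3

    conn = sqlite3.connect("./cache/Angela_Merkel.sqlite")
    count = conn.execute("SELECT COUNT(*) FROM failed_to_fetch").fetchone()[0]
    print(count)  # should print 0; otherwise rerun the fetch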

Then the script uses Dijkstra's algorithm in breadth-first
mode to analyze the graph. By default this is done in memory;
it is, however, possible to do it in SQL. Using SQL is
recommended only if the data exceeds the RAM, as it is
considerably slower.
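
On an unweighted link graph, Dijkstra's algorithm reduces to a
breadth-first search. The following is a self-contained sketch
of that idea, not the repository's ``DijkstraHelper``; the
adjacency-dict input is assumed for illustration:

.. code:: python

    from collections import deque

    def shortest_path_bfs(adjacency, source, target):
        # ``adjacency`` maps a page to the pages it links to.
        # With unit edge weights, Dijkstra degenerates to BFS.
        previous = {source: None}
        queue = deque([source])
        while queue:
            page = queue.popleft()
            if page == target:
                path = []
                while page is not None:   # walk the predecessors back
                    path.append(page)
                    page = previous[page]
                return path[::-1]
            for neighbour in adjacency.get(page, ()):
                if neighbour not in previous:
                    previous[neighbour] = page
                    queue.append(neighbour)
        return None                       # target not reachable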

@@ -7,8 +7,10 @@ if(not config["use_sqlite"]):
 from proxy import fetch_proxies

-def get_cache(directory, name):
+def get_cache():
     if(config["use_sqlite"]):
+        directory = config["sqlite_cache_directory"]
+        name = config["sqlite_cache_name"]
         cache_file = os.path.join(directory, "{}.sqlite".format(name))
         if(not os.path.exists(cache_file)):
             with open(cache_file, "w") as fin:

``cfg.py``:

@@ -5,6 +5,8 @@ config = {
     , "mysql_user": "wikipedia"
     , "mysql_password": "wikipediastuff"
     , "mysql_database": "wikipedia_link_db"
+    , "sqlite_cache_directory": "./cache/"
+    , "sqlite_cache_name": "Angela_Merkel"
 }

 if(config["use_sqlite"]):

``main.py``:

@@ -11,8 +11,8 @@ from connectivity import shortest_path
 from graph import DijkstraHelper
 from db_util import get_page_id, get_page_title

-cache = get_cache("./cache/", "Angela_Merkel")
+cache = get_cache()
-receive_link_graph("Angela_Merkel", cache, 2)
+receive_link_graph("Angela_Merkel", cache, 2, lang="en")

 cursor = cache.cursor()
 cursor.execute("SELECT COUNT(source) FROM links")

@@ -21,9 +21,9 @@ def ignore_title(title):
         return True
     return False

-def _receive_links(page, connection):
+def _receive_links(page, connection, lang="en"):
     title = get_page_title(page, connection)
-    url = construct_url(title)
+    url = construct_url(title, lang=lang)

     result = get_data_with_proxy(url, connection)

@@ -53,20 +53,20 @@ def _receive_links(page, connection):
             yield destination
     connection.commit()

-def receive_links(title, connection):
+def receive_links(title, connection, lang="en"):
-    return list(_receive_links(title, connection))
+    return list(_receive_links(title, connection, lang=lang))


-def receive_link_graph(title, connection, depth):
+def receive_link_graph(title, connection, depth, lang="en"):
     page = get_page_id(title, connection)
-    do_receive_link_graph(page, connection, depth, fetch_missing=True)
+    do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)

     cursor = connection.cursor()
     cursor.execute(sql.statements["count_failed_to_fetch"])
     if(cursor.fetchone()[0]):
-        do_receive_link_graph(page, connection, depth, fetch_missing=True)
+        do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)

-def do_receive_link_graph(page, connection, depth, fetch_missing=False):
+def do_receive_link_graph(page, connection, depth, fetch_missing=False, lang="en"):
     if(depth < 0):
         # end of recursion
         return

@@ -78,7 +78,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):
         delete_cursor = connection.cursor()
         cursor.execute(sql.statements["get_failed_to_fetch"])
         for d, p in cursor:
-            do_receive_link_graph(p, connection, d, fetch_missing=False)
+            do_receive_link_graph(p, connection, d, fetch_missing=False, lang=lang)
             delete_cursor.execute(sql.statements["delete_failed_to_fetch"], (p,))

@@ -93,7 +93,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):

     for link in _receive_links(page, connection):
         try:
-            do_receive_link_graph(link, connection, depth - 1)
+            do_receive_link_graph(link, connection, depth - 1, lang=lang)
         except NoMoreProxiesException as e:
             logger.exception("All proxies are blocked")
             # Wikipedia blocked all our proxies.