finished language support

This commit is contained in:
Daniel Knüttel 2019-02-25 14:43:15 +01:00
parent bbbfd9eb57
commit e7b8b25528
5 changed files with 19 additions and 14 deletions

View File

@ -31,6 +31,8 @@ the mysql connection information is correct.
Then one must edit the name of the article to analyze
and the depth to receive the links. After this is done the
link graph can be received (using ``python3 main.py``).
One can specify the language to use using a language
abbreviation in ``receive_link_graph``.
It might be necessary to run this part several times if the
program was unable to fetch all links. One can check for
@ -43,4 +45,3 @@ in-memory, it is however possible to do it with SQL. Using
Using SQL is recommended only if the data exceeds the RAM, as it
is way slower.

View File

@ -7,8 +7,10 @@ if(not config["use_sqlite"]):
from proxy import fetch_proxies from proxy import fetch_proxies
def get_cache(directory, name): def get_cache():
if(config["use_sqlite"]): if(config["use_sqlite"]):
directory = config["sqlite_cache_directory"]
name = config["sqlite_cache_name"]
cache_file = os.path.join(directory, "{}.sqlite".format(name)) cache_file = os.path.join(directory, "{}.sqlite".format(name))
if(not os.path.exists(cache_file)): if(not os.path.exists(cache_file)):
with open(cache_file, "w") as fin: with open(cache_file, "w") as fin:

View File

@ -5,6 +5,8 @@ config = {
, "mysql_user": "wikipedia" , "mysql_user": "wikipedia"
, "mysql_password": "wikipediastuff" , "mysql_password": "wikipediastuff"
, "mysql_database": "wikipedia_link_db" , "mysql_database": "wikipedia_link_db"
, "sqlite_cache_directory": "./cache/"
, "sqlite_cache_name": "Angela_Merkel"
} }
if(config["use_sqlite"]): if(config["use_sqlite"]):

View File

@ -11,8 +11,8 @@ from connectivity import shortest_path
from graph import DijkstraHelper from graph import DijkstraHelper
from db_util import get_page_id from db_util import get_page_id
cache = get_cache("./cache/", "Angela_Merkel") cache = get_cache()
receive_link_graph("Angela_Merkel", cache, 2) receive_link_graph("Angela_Merkel", cache, 2, lang="en")
cursor = cache.cursor() cursor = cache.cursor()
cursor.execute("SELECT COUNT(source) FROM links") cursor.execute("SELECT COUNT(source) FROM links")

View File

@ -21,9 +21,9 @@ def ignore_title(title):
return True return True
return False return False
def _receive_links(page, connection): def _receive_links(page, connection, lang="en"):
title = get_page_title(page, connection) title = get_page_title(page, connection)
url = construct_url(title) url = construct_url(title, lang=lang)
result = get_data_with_proxy(url, connection) result = get_data_with_proxy(url, connection)
@ -53,20 +53,20 @@ def _receive_links(page, connection):
yield destination yield destination
connection.commit() connection.commit()
def receive_links(title, connection): def receive_links(title, connection, lang="en"):
return list(_receive_links(title, connection)) return list(_receive_links(title, connection, lang=lang))
def receive_link_graph(title, connection, depth): def receive_link_graph(title, connection, depth, lang="en"):
page = get_page_id(title, connection) page = get_page_id(title, connection)
do_receive_link_graph(page, connection, depth, fetch_missing=True) do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)
cursor = connection.cursor() cursor = connection.cursor()
cursor.execute(sql.statements["count_failed_to_fetch"]) cursor.execute(sql.statements["count_failed_to_fetch"])
if(cursor.fetchone()[0]): if(cursor.fetchone()[0]):
do_receive_link_graph(page, connection, depth, fetch_missing=True) do_receive_link_graph(page, connection, depth, fetch_missing=True, lang=lang)
def do_receive_link_graph(page, connection, depth, fetch_missing=False): def do_receive_link_graph(page, connection, depth, fetch_missing=False, lang="en"):
if(depth < 0): if(depth < 0):
# end of recursion # end of recursion
return return
@ -78,7 +78,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):
delete_cursor = connection.cursor() delete_cursor = connection.cursor()
cursor.execute(sql.statements["get_failed_to_fetch"]) cursor.execute(sql.statements["get_failed_to_fetch"])
for d, p in cursor: for d, p in cursor:
do_receive_link_graph(p, connection, d, fetch_missing=False) do_receive_link_graph(p, connection, d, fetch_missing=False, lang=lang)
delete_cursor.execute(sql.statements["delete_failed_to_fetch"], (p,)) delete_cursor.execute(sql.statements["delete_failed_to_fetch"], (p,))
@ -93,7 +93,7 @@ def do_receive_link_graph(page, connection, depth, fetch_missing=False):
for link in _receive_links(page, connection): for link in _receive_links(page, connection):
try: try:
do_receive_link_graph(link, connection, depth - 1) do_receive_link_graph(link, connection, depth - 1, lang=lang)
except NoMoreProxiesException as e: except NoMoreProxiesException as e:
logger.exception("All proxies are blocked") logger.exception("All proxies are blocked")
# Wikipedia blocked all our proxies. # Wikipedia blocked all our proxies.