diff --git a/exam/ex01/cache.py b/exam/ex01/cache.py
index 0dc0c80..e691b83 100644
--- a/exam/ex01/cache.py
+++ b/exam/ex01/cache.py
@@ -3,8 +3,8 @@ import sqlite3
 
 from proxy import fetch_proxies
 
-def get_cache(directory):
-    cache_file = os.path.join(directory, "cache.sqlite")
+def get_cache(directory, name):
+    cache_file = os.path.join(directory, "{}.sqlite".format(name))
     if(not os.path.exists(cache_file)):
         with open(cache_file, "w") as fin:
             pass
@@ -14,7 +14,8 @@ def get_cache(directory):
 
         cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
         cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
-        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT, value INT)")
+        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT UNIQUE, value INT)")
+        cursor.execute("CREATE TABLE failed_to_fetch(title TEXT, depth INT)")
 
         db.commit()
         db = sqlite3.connect(cache_file)
diff --git a/exam/ex01/main.py b/exam/ex01/main.py
index 5c9a08a..e49f408 100644
--- a/exam/ex01/main.py
+++ b/exam/ex01/main.py
@@ -5,5 +5,15 @@ logging.basicConfig(level=logging.DEBUG)
 from cache import get_cache
 from receive import receive_links, receive_link_graph
 
-cache = get_cache("./cache/")
-receive_link_graph("Angela_Merkel", cache, 3)
+from dijkstra import prepare_dijkstra, dijkstra
+
+cache = get_cache("./cache/", "Angela_Merkel")
+receive_link_graph("Angela_Merkel", cache, 2)
+
+cursor = cache.cursor()
+cursor.execute("SELECT COUNT(source) FROM links")
+print(cursor.fetchall())
+
+prepare_dijkstra(cache)
+dijkstra("Angela_Merkel", cache)
+
diff --git a/exam/ex01/proxy.py b/exam/ex01/proxy.py
index 61d383a..d44b531 100644
--- a/exam/ex01/proxy.py
+++ b/exam/ex01/proxy.py
@@ -10,6 +10,9 @@ from bs4 import BeautifulSoup
 
 logger = logging.getLogger(__name__)
 
+class NoMoreProxiesException(Exception):
+    pass
+
 def get_data_with_proxy(url, conn_object, visit_first=None):
     cursor = conn_object.cursor()
     # Assume that table name is proxies
@@ -31,7 +34,7 @@ def get_data_with_proxy(url, conn_object, visit_first=None):
                     (time.time(), i))
             continue
         return response.json()
-    raise Exception("No more proxies left")
+    raise NoMoreProxiesException("No more proxies left")
 
 
 def fetch_proxies(connection):
@@ -48,13 +51,15 @@ def fetch_proxies(connection):
     soup = BeautifulSoup(resp.text, "html.parser")
     cursor = connection.cursor()
 
-    for i, (ip_addr, port) in enumerate(_get_rows(soup)):
+    cnt = 0
+    for ip_addr, port in _get_rows(soup):
         url = "http://{}:{}".format(ip_addr, port)
 
         if(not proxy_is_in_db(url, connection)):
             cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
 
-    logging.info("added {} new proxies".format(i))
+            cnt += 1
+    logging.info("added {} new proxies".format(cnt))
     connection.commit()
diff --git a/exam/ex01/receive.py b/exam/ex01/receive.py
index 876b4ea..26c05f9 100644
--- a/exam/ex01/receive.py
+++ b/exam/ex01/receive.py
@@ -1,6 +1,6 @@
 import logging
 from url import construct_url
-from proxy import get_data_with_proxy
+from proxy import get_data_with_proxy, NoMoreProxiesException
 
 logger = logging.getLogger(__name__)
 
@@ -44,6 +44,16 @@ def receive_link_graph(title, connection, depth):
     logger.info("fetching links for {}".format(title))
 
     for link in _receive_links(title, connection):
-        receive_link_graph(link, connection, depth - 1)
+        try:
+            receive_link_graph(link, connection, depth - 1)
+        except NoMoreProxiesException as e:
+            logger.exception("All proxies are blocked")
+            # Wikipedia blocked all our proxies.
+            # Retry later, so we have to store the list of what is still to fetch.
+
+            cursor = connection.cursor()
+            cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, depth - 1))
+            connection.commit()
+
 
 
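A possible follow-up, not part of the diff above: a minimal sketch of how the new failed_to_fetch table could be drained on a later run. The helper name retry_failed_fetches is an assumption chosen for illustration; it reuses receive_link_graph and NoMoreProxiesException from the modules changed above and deletes a pending row only after its fetch succeeds.

    # retry_failed.py -- hypothetical helper, not included in the diff above
    from receive import receive_link_graph
    from proxy import NoMoreProxiesException

    def retry_failed_fetches(connection):
        cursor = connection.cursor()
        # Load the (title, depth) pairs that could not be fetched earlier.
        cursor.execute("SELECT title, depth FROM failed_to_fetch")
        pending = cursor.fetchall()
        for title, depth in pending:
            try:
                receive_link_graph(title, connection, depth)
            except NoMoreProxiesException:
                # Still no usable proxies; keep the row for the next attempt.
                continue
            # Fetched successfully, so the pending entry can be removed.
            cursor.execute("DELETE FROM failed_to_fetch WHERE title = ?", (title,))
        connection.commit()

Since failed_to_fetch has no UNIQUE constraint, the same title may be stored more than once; whether to de-duplicate before retrying is left open here, mirroring the diff.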