From 7cde3c597a3bbde982639299f1323ea55e013cf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Kn=C3=BCttel?= Date: Fri, 15 Feb 2019 12:46:32 +0100 Subject: [PATCH] basically mysql is working --- exam/ex01/cache.py | 9 +++++---- exam/ex01/cfg.py | 2 +- exam/ex01/connectivity.py | 22 +++++++--------------- exam/ex01/dijkstra.py | 2 +- exam/ex01/proxy.py | 15 ++++++--------- exam/ex01/receive.py | 31 ++++++++++++++++++++++++++----- 6 files changed, 46 insertions(+), 35 deletions(-) diff --git a/exam/ex01/cache.py b/exam/ex01/cache.py index fe08665..449f47b 100644 --- a/exam/ex01/cache.py +++ b/exam/ex01/cache.py @@ -31,16 +31,17 @@ def get_cache(directory, name): , user=config["mysql_user"] , password=config["mysql_password"] , db=config["mysql_database"] - , charset="utf8") + , charset="utf8mb4") cursor = db.cursor() cursor.execute("CREATE TABLE IF NOT EXISTS proxies(proxy varchar(100), lasttime_could_not_be_used DECIMAL)") - cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(50), destination varchar(50))") - cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(50) UNIQUE, value INT)") - cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(50), depth INT)") + cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(100) CHARACTER SET utf8mb4, destination varchar(100) CHARACTER SET utf8mb4)") + cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(100) CHARACTER SET utf8mb4 UNIQUE, value INT)") + cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(100) CHARACTER SET utf8mb4, depth INT)") db.commit() + fetch_proxies(db) return db diff --git a/exam/ex01/cfg.py b/exam/ex01/cfg.py index 285dab9..21ddc27 100644 --- a/exam/ex01/cfg.py +++ b/exam/ex01/cfg.py @@ -4,5 +4,5 @@ config = { , "mysql_server": "172.17.0.2" , "mysql_user": "wikipedia" , "mysql_password": "wikipediastuff" - , "mysql_database": "wikipedia_link_data" + , "mysql_database": "wikipedia_link_db" } diff --git a/exam/ex01/connectivity.py b/exam/ex01/connectivity.py index b80cb06..4e1aba8 100644 --- a/exam/ex01/connectivity.py +++ b/exam/ex01/connectivity.py @@ -4,7 +4,7 @@ from cfg import config def can_reach(title, connection): cursor = connection.cursor() - cursor.execute("SELECT COUNT(destination) FROM links WHERE destination=?", (title, )) + cursor.execute("SELECT COUNT(destination) FROM links WHERE destination=%s", (title, )) count = cursor.fetchone()[0] return count > 0 @@ -17,20 +17,12 @@ def shortest_path(center, title, connection): path = deque() while(current_title != center): path.append(current_title) - if(config["use_sqlite"]): - cursor.execute('''SELECT links.source - FROM links - LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name - WHERE links.destination=:title - ORDER BY dijkstra_helper.value ASC - LIMIT 1''', {"title": current_title}) - else: - cursor.execute('''SELECT links.source - FROM links - LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name - WHERE links.destination=:title - SORT BY dijkstra_helper.value ASC - LIMIT 1''', {"title": current_title}) + cursor.execute('''SELECT links.source + FROM links + LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name + WHERE links.destination=:title + ORDER BY dijkstra_helper.value ASC + LIMIT 1''', {"title": current_title}) current_title = cursor.fetchone()[0] return list(reversed(path)) diff --git a/exam/ex01/dijkstra.py b/exam/ex01/dijkstra.py index 3fe2b2c..db369b8 100644 --- a/exam/ex01/dijkstra.py +++ b/exam/ex01/dijkstra.py @@ -50,7 +50,7 @@ def recursive_dijkstra(titles, value, connection): def dijkstra(title, connection): cursor = connection.cursor() - cursor.execute("UPDATE dijkstra_helper SET value=0 WHERE name=?", (title,)) + cursor.execute("UPDATE dijkstra_helper SET value=0 WHERE name=%s", (title,)) todos = dijkstra_one(title, 1, connection) recursive_dijkstra(todos, 2, connection) diff --git a/exam/ex01/proxy.py b/exam/ex01/proxy.py index e35b8f1..4d94540 100644 --- a/exam/ex01/proxy.py +++ b/exam/ex01/proxy.py @@ -18,10 +18,7 @@ class NoMoreProxiesException(Exception): def get_data_with_proxy(url, conn_object, visit_first=None): cursor = conn_object.cursor() # Assume that table name is proxies - if(config["use_sqlite"]): - cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''') - else: - cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies SORT BY lasttime_could_not_be_used ASC''') + cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''') headers = {} for i, lasttime_could_not_be_used in cursor: session = requests.Session() @@ -30,12 +27,12 @@ def get_data_with_proxy(url, conn_object, visit_first=None): response = session.get(url, headers=headers, timeout=3) except: # If proxy is invalid/inactive, update lasttime could not be used and go next proxy - cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''', + cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %d WHERE proxy = %s ''', (time.time(), i)) continue # If text is empty, update lasttime could not be used and go next proxy if not response.text or 399 < response.status_code < 600: - cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''', + cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %d WHERE proxy = %s ''', (time.time(), i)) continue return response.json() @@ -62,7 +59,7 @@ def fetch_proxies(connection): url = "http://{}:{}".format(ip_addr, port) if(not proxy_is_in_db(url, connection)): - cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,)) + cursor.execute("INSERT INTO proxies VALUES(%s, 0)", (url,)) cnt += 1 logging.info("added {} new proxies".format(cnt)) connection.commit() @@ -91,8 +88,8 @@ def _get_rows(soup): def proxy_is_in_db(url, connection): cursor = connection.cursor() - cursor.execute("SELECT proxy FROM proxies WHERE proxy = ?", (url,)) - return cursor.fetchall() != [] + cursor.execute("SELECT proxy FROM proxies WHERE proxy = %s", (url,)) + return bool(cursor.fetchall()) diff --git a/exam/ex01/receive.py b/exam/ex01/receive.py index 26c05f9..9d28fe2 100644 --- a/exam/ex01/receive.py +++ b/exam/ex01/receive.py @@ -4,6 +4,18 @@ from proxy import get_data_with_proxy, NoMoreProxiesException logger = logging.getLogger(__name__) +def ignore_title(title): + ignore_starters = ["Help:" + , "Wikipedia:" + , "Template:" + , "Template_talk:" + , "Category:" + ] + for ignore in ignore_starters: + if(title.startswith(ignore)): + return True + return False + def _receive_links(title, connection): url = construct_url(title) @@ -15,14 +27,22 @@ def _receive_links(title, connection): destination_title = page_data["title"].replace(" ", "_") # avoid 1-loops if(destination_title == title): - pass - cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title)) + continue + if(ignore_title(title)): + continue + cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title)) yield destination_title else: for destination in page_data["links"]: + if(ignore_title(title)): + continue destination_title = destination["title"].replace(" ", "_") - cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title)) + try: + cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title)) + except Exception as e: + print(destination_title) + raise e yield destination_title connection.commit() @@ -36,7 +56,8 @@ def receive_link_graph(title, connection, depth): return cursor = connection.cursor() - cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,)) + print(repr(title)) + cursor.execute("SELECT COUNT(source) FROM links WHERE source=%s", (title,)) if(cursor.fetchone()[0] != 0): # we fetched that title already return @@ -52,7 +73,7 @@ def receive_link_graph(title, connection, depth): # Retry later, so we have to store our list that is still to fetch. cursor = connection.cursor() - cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, depth - 1)) + cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(%s, %d)", (link, depth - 1)) connection.commit()