fetching the list works

Daniel Knüttel 2019-02-02 16:06:57 +01:00
parent 783ec462e3
commit dabddc352f
4 changed files with 36 additions and 10 deletions

View File

@@ -3,8 +3,8 @@ import sqlite3
 from proxy import fetch_proxies
 
-def get_cache(directory):
-    cache_file = os.path.join(directory, "cache.sqlite")
+def get_cache(directory, name):
+    cache_file = os.path.join(directory, "{}.sqlite".format(name))
     if(not os.path.exists(cache_file)):
         with open(cache_file, "w") as fin:
             pass
@@ -14,7 +14,8 @@ def get_cache(directory):
         cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
         cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
-        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT, value INT)")
+        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT UNIQUE, value INT)")
+        cursor.execute("CREATE TABLE failed_to_fetch(title TEXT, depth INT)")
         db.commit()
     db = sqlite3.connect(cache_file)

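Note: the UNIQUE constraint on dijkstra_helper.name presumably exists so the Dijkstra bookkeeping can be seeded without duplicate rows when it is run more than once. A minimal sketch of what that enables, assuming an INSERT OR IGNORE style prepare step (the helper name and the infinity placeholder are illustrative, not part of this commit):

    def seed_dijkstra_helper(connection, infinity=2**62):
        # Hypothetical sketch: register every known source page exactly once;
        # thanks to the UNIQUE constraint, re-running this is a no-op.
        cursor = connection.cursor()
        cursor.execute("SELECT DISTINCT source FROM links")
        names = cursor.fetchall()
        cursor.executemany(
            "INSERT OR IGNORE INTO dijkstra_helper(name, value) VALUES(?, ?)",
            [(name, infinity) for (name,) in names])
        connection.commit()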
View File

@@ -5,5 +5,15 @@ logging.basicConfig(level=logging.DEBUG)
 from cache import get_cache
 from receive import receive_links, receive_link_graph
-cache = get_cache("./cache/")
-receive_link_graph("Angela_Merkel", cache, 3)
+from dijkstra import prepare_dijkstra, dijkstra
+
+cache = get_cache("./cache/", "Angela_Merkel")
+receive_link_graph("Angela_Merkel", cache, 2)
+
+cursor = cache.cursor()
+cursor.execute("SELECT COUNT(source) FROM links")
+print(cursor.fetchall())
+
+prepare_dijkstra(cache)
+dijkstra("Angela_Merkel", cache)

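Since receive_link_graph can now leave pages unfetched when every proxy is blocked (see the failed_to_fetch handling further down), the driver could check for leftovers before trusting the shortest-path result. A hedged sketch only, reusing the cursor above; nothing like this is in the commit itself:

    cursor.execute("SELECT COUNT(*) FROM failed_to_fetch")
    missing = cursor.fetchone()[0]
    if missing:
        # The link graph is incomplete, so the Dijkstra result may be wrong
        # until the stored pages have been fetched again.
        print("{} pages still have to be re-fetched".format(missing))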
View File

@@ -10,6 +10,9 @@ from bs4 import BeautifulSoup
 logger = logging.getLogger(__name__)
 
+class NoMoreProxiesException(Exception):
+    pass
+
 def get_data_with_proxy(url, conn_object, visit_first=None):
     cursor = conn_object.cursor()
     # Assume that table name is proxies
@@ -31,7 +34,7 @@ def get_data_with_proxy(url, conn_object, visit_first=None):
                     (time.time(), i))
             continue
         return response.json()
-    raise Exception("No more proxies left")
+    raise NoMoreProxiesException("No more proxies left")
 
 def fetch_proxies(connection):
@@ -48,13 +51,15 @@ def fetch_proxies(connection):
     soup = BeautifulSoup(resp.text, "html.parser")
 
     cursor = connection.cursor()
-    for i, (ip_addr, port) in enumerate(_get_rows(soup)):
+    cnt = 0
+    for ip_addr, port in _get_rows(soup):
         url = "http://{}:{}".format(ip_addr, port)
         if(not proxy_is_in_db(url, connection)):
             cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
-    logging.info("added {} new proxies".format(i))
+            cnt += 1
+    logging.info("added {} new proxies".format(cnt))
     connection.commit()

View File

@@ -1,6 +1,6 @@
 import logging
 
 from url import construct_url
-from proxy import get_data_with_proxy
+from proxy import get_data_with_proxy, NoMoreProxiesException
 
 logger = logging.getLogger(__name__)
@@ -44,6 +44,16 @@ def receive_link_graph(title, connection, depth):
     logger.info("fetching links for {}".format(title))
 
     for link in _receive_links(title, connection):
-        receive_link_graph(link, connection, depth - 1)
+        try:
+            receive_link_graph(link, connection, depth - 1)
+        except NoMoreProxiesException as e:
+            logger.exception("All proxies are blocked")
+            # Wikipedia blocked all our proxies.
+            # Retry later, so we have to store our list that is still to fetch.
+            cursor = connection.cursor()
+            cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, depth - 1))
+            connection.commit()
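
The failed_to_fetch table created in cache.py is only written here; the commit does not show the retry itself. A possible retry loop, purely as a hedged sketch (the function name and its placement are assumptions, not part of this commit):

    def retry_failed_fetches(connection):
        # Hypothetical sketch: re-run receive_link_graph for every page that
        # could not be fetched because all proxies were blocked.
        cursor = connection.cursor()
        cursor.execute("SELECT title, depth FROM failed_to_fetch")
        pending = cursor.fetchall()
        cursor.execute("DELETE FROM failed_to_fetch")
        connection.commit()
        for title, depth in pending:
            receive_link_graph(title, connection, depth)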