fetching the list works

Daniel Knüttel 2019-02-02 16:06:57 +01:00
parent 783ec462e3
commit dabddc352f
4 changed files with 36 additions and 10 deletions

View File

@@ -3,8 +3,8 @@ import sqlite3
 from proxy import fetch_proxies
 
-def get_cache(directory):
-    cache_file = os.path.join(directory, "cache.sqlite")
+def get_cache(directory, name):
+    cache_file = os.path.join(directory, "{}.sqlite".format(name))
     if(not os.path.exists(cache_file)):
         with open(cache_file, "w") as fin:
             pass
@@ -14,7 +14,8 @@ def get_cache(directory):
         cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
         cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
-        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT, value INT)")
+        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT UNIQUE, value INT)")
+        cursor.execute("CREATE TABLE failed_to_fetch(title TEXT, depth INT)")
         db.commit()
     db = sqlite3.connect(cache_file)
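
Pieced together from the two hunks above, the reworked get_cache would look roughly like the sketch below. The connection and cursor setup inside the if block and the trailing fetch_proxies call are assumptions; they are not visible in this diff.

import os
import sqlite3

from proxy import fetch_proxies

def get_cache(directory, name):
    # One database per search, e.g. ./cache/Angela_Merkel.sqlite.
    cache_file = os.path.join(directory, "{}.sqlite".format(name))
    if(not os.path.exists(cache_file)):
        # Create the empty file, then set up the schema once.
        with open(cache_file, "w") as fin:
            pass
        db = sqlite3.connect(cache_file)   # assumed glue code
        cursor = db.cursor()               # assumed glue code
        cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
        cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT UNIQUE, value INT)")
        cursor.execute("CREATE TABLE failed_to_fetch(title TEXT, depth INT)")
        db.commit()
    db = sqlite3.connect(cache_file)
    fetch_proxies(db)                      # assumed: refresh the proxy list on every call
    return db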

View File

@@ -5,5 +5,15 @@ logging.basicConfig(level=logging.DEBUG)
 from cache import get_cache
 from receive import receive_links, receive_link_graph
-cache = get_cache("./cache/")
-receive_link_graph("Angela_Merkel", cache, 3)
+from dijkstra import prepare_dijkstra, dijkstra
+
+cache = get_cache("./cache/", "Angela_Merkel")
+receive_link_graph("Angela_Merkel", cache, 2)
+
+cursor = cache.cursor()
+cursor.execute("SELECT COUNT(source) FROM links")
+print(cursor.fetchall())
+
+prepare_dijkstra(cache)
+dijkstra("Angela_Merkel", cache)

View File

@@ -10,6 +10,9 @@ from bs4 import BeautifulSoup
 logger = logging.getLogger(__name__)
 
+class NoMoreProxiesException(Exception):
+    pass
+
 def get_data_with_proxy(url, conn_object, visit_first=None):
     cursor = conn_object.cursor()
     # Assume that table name is proxies
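
With a dedicated exception type, callers can react specifically to an exhausted proxy list rather than catching a bare Exception. A minimal usage sketch (url and cache are placeholders):

try:
    data = get_data_with_proxy(url, cache)
except NoMoreProxiesException:
    # Every proxy is currently blocked; refresh the list and retry later.
    fetch_proxies(cache)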
@@ -31,7 +34,7 @@ def get_data_with_proxy(url, conn_object, visit_first=None):
                 (time.time(), i))
             continue
         return response.json()
-    raise Exception("No more proxies left")
+    raise NoMoreProxiesException("No more proxies left")
 
 def fetch_proxies(connection):
@@ -48,13 +51,15 @@ def fetch_proxies(connection):
     soup = BeautifulSoup(resp.text, "html.parser")
     cursor = connection.cursor()
-    for i, (ip_addr, port) in enumerate(_get_rows(soup)):
+    cnt = 0
+    for ip_addr, port in _get_rows(soup):
         url = "http://{}:{}".format(ip_addr, port)
         if(not proxy_is_in_db(url, connection)):
             cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
-    logging.info("added {} new proxies".format(i))
+            cnt += 1
+    logging.info("added {} new proxies".format(cnt))
     connection.commit()
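
fetch_proxies relies on a proxy_is_in_db helper that this commit does not touch. A minimal version consistent with the proxies table could look like this (hypothetical, not taken from the repository):

def proxy_is_in_db(url, connection):
    # True if this proxy URL has already been stored.
    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(proxy) FROM proxies WHERE proxy = ?", (url,))
    return cursor.fetchone()[0] > 0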

View File

@@ -1,6 +1,6 @@
 import logging
 
 from url import construct_url
-from proxy import get_data_with_proxy
+from proxy import get_data_with_proxy, NoMoreProxiesException
 
 logger = logging.getLogger(__name__)
@@ -44,6 +44,16 @@ def receive_link_graph(title, connection, depth):
     logger.info("fetching links for {}".format(title))
     for link in _receive_links(title, connection):
-        receive_link_graph(link, connection, depth - 1)
+        try:
+            receive_link_graph(link, connection, depth - 1)
+        except NoMoreProxiesException as e:
+            logger.exception("All proxies are blocked")
+            # Wikipedia blocked all our proxies.
+            # We will retry later, so store the titles that still have to be fetched.
+            cursor = connection.cursor()
+            cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, depth - 1))
+            connection.commit()
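
Nothing in this commit reads failed_to_fetch back yet; a later retry pass could drain it roughly as sketched below. The function name and the delete-before-retry strategy are assumptions.

def retry_failed(connection):
    # Re-run the crawl for every title whose fetch previously failed.
    # Assumed to live in receive.py next to receive_link_graph.
    cursor = connection.cursor()
    cursor.execute("SELECT title, depth FROM failed_to_fetch")
    pending = cursor.fetchall()
    # Clear the queue first; anything that fails again is re-inserted
    # by receive_link_graph itself.
    cursor.execute("DELETE FROM failed_to_fetch")
    connection.commit()
    for title, depth in pending:
        receive_link_graph(title, connection, depth)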