fetching the list works
This commit is contained in:
parent 783ec462e3
commit dabddc352f

@@ -3,8 +3,8 @@ import sqlite3
 from proxy import fetch_proxies
 
 
-def get_cache(directory):
-    cache_file = os.path.join(directory, "cache.sqlite")
+def get_cache(directory, name):
+    cache_file = os.path.join(directory, "{}.sqlite".format(name))
     if(not os.path.exists(cache_file)):
         with open(cache_file, "w") as fin:
             pass

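Note: the cache module's get_cache() now takes the start article's name in addition to the directory, so each crawl gets its own database file instead of one shared cache.sqlite. Callers have to pass the extra argument, as the driver script further down does:

    cache = get_cache("./cache/", "Angela_Merkel")   # creates/opens ./cache/Angela_Merkel.sqlite
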
@@ -14,7 +14,8 @@ def get_cache(directory):
         cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
         cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
-        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT, value INT)")
+        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT UNIQUE, value INT)")
+        cursor.execute("CREATE TABLE failed_to_fetch(title TEXT, depth INT)")
 
         db.commit()
     db = sqlite3.connect(cache_file)

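For context, a sketch of how the full get_cache() plausibly reads after these two hunks. The pieces not visible in the diff (cursor creation inside the if-block, the final return) are assumptions, and the comments only summarize the apparent intent of the new schema:

    import os
    import sqlite3


    def get_cache(directory, name):
        # One database file per start article, e.g. ./cache/Angela_Merkel.sqlite
        cache_file = os.path.join(directory, "{}.sqlite".format(name))
        if not os.path.exists(cache_file):
            with open(cache_file, "w") as fin:
                pass
            db = sqlite3.connect(cache_file)
            cursor = db.cursor()
            cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
            cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
            # UNIQUE on name allows upsert-style inserts without duplicating nodes
            cursor.execute("CREATE TABLE dijkstra_helper(name TEXT UNIQUE, value INT)")
            # Pages whose fetch failed (e.g. all proxies blocked), kept for a later retry
            cursor.execute("CREATE TABLE failed_to_fetch(title TEXT, depth INT)")
            db.commit()
        db = sqlite3.connect(cache_file)
        return db
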
@@ -5,5 +5,15 @@ logging.basicConfig(level=logging.DEBUG)
 from cache import get_cache
 from receive import receive_links, receive_link_graph
 
-cache = get_cache("./cache/")
-receive_link_graph("Angela_Merkel", cache, 3)
+from dijkstra import prepare_dijkstra, dijkstra
+
+cache = get_cache("./cache/", "Angela_Merkel")
+receive_link_graph("Angela_Merkel", cache, 2)
+
+cursor = cache.cursor()
+cursor.execute("SELECT COUNT(source) FROM links")
+print(cursor.fetchall())
+
+prepare_dijkstra(cache)
+dijkstra("Angela_Merkel", cache)

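The driver script now builds a per-article cache, crawls to depth 2, prints the number of stored links, and runs Dijkstra. Not part of this commit, but one way the new failed_to_fetch table could be checked before trusting the shortest-path result (a sketch, reusing the cache connection from the script above):

    # Sketch: warn if the crawl left pages unfetched (e.g. because all proxies were blocked).
    cursor = cache.cursor()
    cursor.execute("SELECT COUNT(*) FROM failed_to_fetch")
    missing = cursor.fetchone()[0]
    if missing:
        print("{} pages still have to be fetched before the link graph is complete".format(missing))
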
@@ -10,6 +10,9 @@ from bs4 import BeautifulSoup
 
 logger = logging.getLogger(__name__)
 
+class NoMoreProxiesException(Exception):
+    pass
+
 def get_data_with_proxy(url, conn_object, visit_first=None):
     cursor = conn_object.cursor()
     # Assume that table name is proxies

@@ -31,7 +34,7 @@ def get_data_with_proxy(url, conn_object, visit_first=None):
                 (time.time(), i))
             continue
         return response.json()
-    raise Exception("No more proxies left")
+    raise NoMoreProxiesException("No more proxies left")
 
 
 def fetch_proxies(connection):

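Raising a dedicated NoMoreProxiesException instead of a bare Exception lets callers react to proxy exhaustion specifically while other errors still propagate. A hypothetical wrapper (not in this commit) could, for instance, refresh the proxy list once before giving up:

    from proxy import get_data_with_proxy, fetch_proxies, NoMoreProxiesException


    def get_data_with_refresh(url, connection):
        # Hypothetical helper: on proxy exhaustion, pull a fresh proxy list once and retry.
        try:
            return get_data_with_proxy(url, connection)
        except NoMoreProxiesException:
            fetch_proxies(connection)
            return get_data_with_proxy(url, connection)
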
@@ -48,13 +51,15 @@ def fetch_proxies(connection):
     soup = BeautifulSoup(resp.text, "html.parser")
 
     cursor = connection.cursor()
-    for i, (ip_addr, port) in enumerate(_get_rows(soup)):
+    cnt = 0
+    for ip_addr, port in _get_rows(soup):
 
         url = "http://{}:{}".format(ip_addr, port)
 
         if(not proxy_is_in_db(url, connection)):
             cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
-    logging.info("added {} new proxies".format(i))
+            cnt += 1
+    logging.info("added {} new proxies".format(cnt))
     connection.commit()

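The logging fix replaces the enumerate index i, which counted every scraped row and was referenced outside the loop, with cnt, which counts only proxies actually inserted. proxy_is_in_db() is called here but not shown in this commit; a minimal sketch of what it presumably does against the proxies table defined in the cache module:

    def proxy_is_in_db(url, connection):
        # Sketch only: check whether this proxy URL is already stored.
        cursor = connection.cursor()
        cursor.execute("SELECT 1 FROM proxies WHERE proxy = ?", (url,))
        return cursor.fetchone() is not None
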
@@ -1,6 +1,6 @@
 import logging
 from url import construct_url
-from proxy import get_data_with_proxy
+from proxy import get_data_with_proxy, NoMoreProxiesException
 
 logger = logging.getLogger(__name__)

@@ -44,6 +44,16 @@ def receive_link_graph(title, connection, depth):
     logger.info("fetching links for {}".format(title))
 
     for link in _receive_links(title, connection):
-        receive_link_graph(link, connection, depth - 1)
+        try:
+            receive_link_graph(link, connection, depth - 1)
+        except NoMoreProxiesException as e:
+            logger.exception("All proxies are blocked")
+            # Wikipedia blocked all our proxies.
+            # Retry later, so we have to store our list that is still to fetch.
+
+            cursor = connection.cursor()
+            cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, depth - 1))
+            connection.commit()

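With failures recorded in failed_to_fetch, a later run can drain that table and retry. This is not part of the commit; a sketch of what such a pass could look like next to receive_link_graph() in the receive module:

    def retry_failed(connection):
        # Sketch: re-fetch pages that failed earlier; new failures get re-queued
        # by receive_link_graph() itself.
        cursor = connection.cursor()
        cursor.execute("SELECT title, depth FROM failed_to_fetch")
        pending = cursor.fetchall()
        cursor.execute("DELETE FROM failed_to_fetch")
        connection.commit()
        for title, depth in pending:
            receive_link_graph(title, connection, depth)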