fetching the list works
commit dabddc352f
parent 783ec462e3
cache.py
@@ -3,8 +3,8 @@ import sqlite3
 from proxy import fetch_proxies
 
 
-def get_cache(directory):
-    cache_file = os.path.join(directory, "cache.sqlite")
+def get_cache(directory, name):
+    cache_file = os.path.join(directory, "{}.sqlite".format(name))
     if(not os.path.exists(cache_file)):
         with open(cache_file, "w") as fin:
             pass
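With the extra name argument every crawl gets its own database file instead of one shared cache.sqlite. Given the call in the driver script further down, the path works out as:

    cache = get_cache("./cache/", "Angela_Merkel")
    # os.path.join("./cache/", "{}.sqlite".format("Angela_Merkel"))
    # -> "./cache/Angela_Merkel.sqlite"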
@@ -14,7 +14,8 @@ def get_cache(directory):
 
         cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
         cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
-        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT, value INT)")
+        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT UNIQUE, value INT)")
+        cursor.execute("CREATE TABLE failed_to_fetch(title TEXT, depth INT)")
 
         db.commit()
     db = sqlite3.connect(cache_file)
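Read together, the two hunks leave get_cache roughly as below. This is a reassembly from the diff context, not the full file; the lines marked "assumed" fall outside the hunks and are guesses:

    import os
    import sqlite3

    from proxy import fetch_proxies

    def get_cache(directory, name):
        cache_file = os.path.join(directory, "{}.sqlite".format(name))
        if(not os.path.exists(cache_file)):
            with open(cache_file, "w") as fin:
                pass
            db = sqlite3.connect(cache_file)   # assumed: not shown in the diff
            cursor = db.cursor()               # assumed: not shown in the diff
            cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
            cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
            cursor.execute("CREATE TABLE dijkstra_helper(name TEXT UNIQUE, value INT)")
            cursor.execute("CREATE TABLE failed_to_fetch(title TEXT, depth INT)")
            db.commit()
        db = sqlite3.connect(cache_file)
        return db                              # assumed: callers use the connection

The new UNIQUE constraint on dijkstra_helper.name allows deduplicated seeding (e.g. via INSERT OR IGNORE), and failed_to_fetch is the retry backlog that receive.py fills below. The next hunk is the top-level driver script; its filename is not shown in this view.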
(driver script; filename not shown in this view)
@@ -5,5 +5,15 @@ logging.basicConfig(level=logging.DEBUG)
 from cache import get_cache
 from receive import receive_links, receive_link_graph
+from dijkstra import prepare_dijkstra, dijkstra
 
-cache = get_cache("./cache/")
-receive_link_graph("Angela_Merkel", cache, 3)
+cache = get_cache("./cache/", "Angela_Merkel")
+
+receive_link_graph("Angela_Merkel", cache, 2)
+
+cursor = cache.cursor()
+cursor.execute("SELECT COUNT(source) FROM links")
+print(cursor.fetchall())
+
+prepare_dijkstra(cache)
+dijkstra("Angela_Merkel", cache)
+
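dijkstra.py itself is not touched by this commit, so prepare_dijkstra and dijkstra are only known by name here. A minimal sketch of what they might do, assuming dijkstra_helper(name TEXT UNIQUE, value INT) holds per-title distances and every link has weight 1; all bodies below are assumptions, not the repository's code:

    import heapq

    UNREACHED = 999999  # sentinel distance; assumed, not from the repo

    def prepare_dijkstra(connection):
        # One dijkstra_helper row per known title; the UNIQUE constraint
        # added in this commit lets INSERT OR IGNORE skip seeded titles.
        cursor = connection.cursor()
        cursor.execute("INSERT OR IGNORE INTO dijkstra_helper(name, value) "
                       "SELECT DISTINCT source, ? FROM links", (UNREACHED,))
        cursor.execute("INSERT OR IGNORE INTO dijkstra_helper(name, value) "
                       "SELECT DISTINCT destination, ? FROM links", (UNREACHED,))
        connection.commit()

    def dijkstra(start, connection):
        # With unit edge weights Dijkstra degenerates into BFS.
        cursor = connection.cursor()
        cursor.execute("UPDATE dijkstra_helper SET value = 0 WHERE name = ?", (start,))
        heap = [(0, start)]
        while heap:
            dist, title = heapq.heappop(heap)
            cursor.execute("SELECT destination FROM links WHERE source = ?", (title,))
            for (neighbor,) in cursor.fetchall():
                cursor.execute("SELECT value FROM dijkstra_helper WHERE name = ?",
                               (neighbor,))
                row = cursor.fetchone()
                if row is not None and dist + 1 < row[0]:
                    cursor.execute("UPDATE dijkstra_helper SET value = ? WHERE name = ?",
                                   (dist + 1, neighbor))
                    heapq.heappush(heap, (dist + 1, neighbor))
        connection.commit()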
proxy.py
@@ -10,6 +10,9 @@ from bs4 import BeautifulSoup
 
 logger = logging.getLogger(__name__)
 
+class NoMoreProxiesException(Exception):
+    pass
+
 def get_data_with_proxy(url, conn_object, visit_first=None):
     cursor = conn_object.cursor()
     # Assume that table name is proxies
@@ -31,7 +34,7 @@ def get_data_with_proxy(url, conn_object, visit_first=None):
                 (time.time(), i))
             continue
         return response.json()
-    raise Exception("No more proxies left")
+    raise NoMoreProxiesException("No more proxies left")
 
 
 def fetch_proxies(connection):
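For context, the rotation loop around these lines presumably looks something like the sketch below; only the UPDATE parameters (time.time(), i), the continue/return and the raise are visible in the hunk, so the SELECT, the requests call, and the error handling are assumptions:

    import time
    import requests

    def get_data_with_proxy(url, conn_object, visit_first=None):
        # visit_first handling omitted in this sketch.
        cursor = conn_object.cursor()
        # Assumed ordering: try the proxies that failed longest ago first.
        cursor.execute("SELECT proxy FROM proxies ORDER BY lasttime_could_not_be_used")
        for (i,) in cursor.fetchall():
            try:
                response = requests.get(url, proxies={"http": i, "https": i}, timeout=3)
            except requests.RequestException:
                # Remember when this proxy last failed, then try the next one.
                cursor.execute(
                    "UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ?",
                    (time.time(), i))
                continue
            return response.json()
        raise NoMoreProxiesException("No more proxies left")

Raising the dedicated NoMoreProxiesException instead of a bare Exception lets receive.py catch proxy exhaustion specifically without masking unrelated bugs.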
@@ -48,13 +51,15 @@ def fetch_proxies(connection):
     soup = BeautifulSoup(resp.text, "html.parser")
 
     cursor = connection.cursor()
-    for i, (ip_addr, port) in enumerate(_get_rows(soup)):
+    cnt = 0
+    for ip_addr, port in _get_rows(soup):
 
         url = "http://{}:{}".format(ip_addr, port)
 
         if(not proxy_is_in_db(url, connection)):
             cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
-    logging.info("added {} new proxies".format(i))
+            cnt += 1
+    logging.info("added {} new proxies".format(cnt))
     connection.commit()
 
 
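The counter change fixes a real logging bug: enumerate's i counted scraped rows rather than rows actually inserted, and referencing i after the loop raises NameError when _get_rows yields nothing. cnt counts only genuine insertions. The helpers _get_rows and proxy_is_in_db are not in this diff; hypothetical versions, assuming the usual proxy-list HTML table layout:

    def proxy_is_in_db(url, connection):
        # True if this proxy URL is already stored.
        cursor = connection.cursor()
        cursor.execute("SELECT 1 FROM proxies WHERE proxy = ?", (url,))
        return cursor.fetchone() is not None

    def _get_rows(soup):
        # Yield (ip, port) from the first two cells of each table row.
        for row in soup.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) >= 2:
                yield cells[0].get_text(strip=True), cells[1].get_text(strip=True)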
receive.py
@@ -1,6 +1,6 @@
 import logging
 from url import construct_url
-from proxy import get_data_with_proxy
+from proxy import get_data_with_proxy, NoMoreProxiesException
 
 logger = logging.getLogger(__name__)
 
@@ -44,6 +44,16 @@ def receive_link_graph(title, connection, depth):
     logger.info("fetching links for {}".format(title))
 
     for link in _receive_links(title, connection):
-        receive_link_graph(link, connection, depth - 1)
+        try:
+            receive_link_graph(link, connection, depth - 1)
+        except NoMoreProxiesException as e:
+            logger.exception("All proxies are blocked")
+            # Wikipedia blocked all our proxies.
+            # Retry later, so we have to store our list that is still to fetch.
+
+            cursor = connection.cursor()
+            cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, depth - 1))
+            connection.commit()
+
 
 
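The except branch only stores the backlog; nothing in this commit reads failed_to_fetch back. A later pass could drain it along these lines (hypothetical sketch; the function name and logic are assumptions, and it presumes fresh proxies have been fetched in the meantime):

    def retry_failed(connection):
        cursor = connection.cursor()
        cursor.execute("SELECT title, depth FROM failed_to_fetch")
        pending = cursor.fetchall()
        # Clear first: whatever fails again is re-inserted by the
        # except branch in receive_link_graph above.
        cursor.execute("DELETE FROM failed_to_fetch")
        connection.commit()
        for title, depth in pending:
            receive_link_graph(title, connection, depth)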