Added MySQL support (in theory)

This commit is contained in:
Daniel Knüttel 2019-02-15 11:47:50 +01:00
parent 635a9f7739
commit c958e44632
5 changed files with 71 additions and 24 deletions

View File

@ -1,9 +1,14 @@
import os
import sqlite3
from cfg import config
if(not config["use_sqlite"]):
import pymysql
from proxy import fetch_proxies
def get_cache(directory, name):
if(config["use_sqlite"]):
cache_file = os.path.join(directory, "{}.sqlite".format(name))
if(not os.path.exists(cache_file)):
with open(cache_file, "w") as fin:
@ -21,6 +26,23 @@ def get_cache(directory, name):
db = sqlite3.connect(cache_file)
fetch_proxies(db)
return db
db = pymysql.connect(
host=config["mysql_server"]
, user=config["mysql_user"]
, password=config["mysql_password"]
, db=config["mysql_database"]
, charset="utf8")
cursor = db.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS proxies(proxy varchar(100), lasttime_could_not_be_used DECIMAL)")
cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(50), destination varchar(50))")
cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(50) UNIQUE, value INT)")
cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(50), depth INT)")
db.commit()
return db
def clear_cache_data(connection):
cursor = connection.cursor()

View File

@ -1,5 +1,7 @@
from collections import deque
from cfg import config
def can_reach(title, connection):
cursor = connection.cursor()
cursor.execute("SELECT COUNT(destination) FROM links WHERE destination=?", (title, ))
@ -15,12 +17,20 @@ def shortest_path(center, title, connection):
path = deque()
while(current_title != center):
path.append(current_title)
if(config["use_sqlite"]):
cursor.execute('''SELECT links.source
FROM links
LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
WHERE links.destination=:title
ORDER BY dijkstra_helper.value ASC
LIMIT 1''', {"title": current_title})
else:
cursor.execute('''SELECT links.source
FROM links
LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
WHERE links.destination=:title
SORT BY dijkstra_helper.value ASC
LIMIT 1''', {"title": current_title})
current_title = cursor.fetchone()[0]
return list(reversed(path))

View File

@ -1,3 +1,6 @@
from collections import deque
from cfg import config
def prepare_dijkstra(connection):
cursor = connection.cursor()
@ -5,7 +8,10 @@ def prepare_dijkstra(connection):
SELECT destination FROM links
''')
if(config["use_sqlite"]):
cursor.execute("UPDATE dijkstra_helper SET value=1e1000")
else:
cursor.execute("UPDATE dijkstra_helper SET value=2147483647")
connection.commit()
def dijkstra_one(title, value, connection):
@ -35,8 +41,10 @@ def recursive_dijkstra(titles, value, connection):
if(not titles):
return
todos = deque()
for title in titles:
todos = dijkstra_one(title, value, connection)
todos.extend(dijkstra_one(title, value, connection))
recursive_dijkstra(todos, value + 1, connection)

View File

@ -6,6 +6,7 @@ from cache import get_cache
from receive import receive_links, receive_link_graph
from dijkstra import prepare_dijkstra, dijkstra
from connectivity import shortest_path
cache = get_cache("./cache/", "Angela_Merkel")
receive_link_graph("Angela_Merkel", cache, 2)
@ -14,6 +15,7 @@ cursor = cache.cursor()
cursor.execute("SELECT COUNT(source) FROM links")
print(cursor.fetchall())
prepare_dijkstra(cache)
dijkstra("Angela_Merkel", cache)
#prepare_dijkstra(cache)
#dijkstra("Angela_Merkel", cache)
print(shortest_path("Angela_Merkel", "Germany", cache))

View File

@ -8,6 +8,8 @@ import logging
import time
from bs4 import BeautifulSoup
from cfg import config
logger = logging.getLogger(__name__)
class NoMoreProxiesException(Exception):
@ -16,7 +18,10 @@ class NoMoreProxiesException(Exception):
def get_data_with_proxy(url, conn_object, visit_first=None):
cursor = conn_object.cursor()
# Assume that table name is proxies
if(config["use_sqlite"]):
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
else:
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies SORT BY lasttime_could_not_be_used ASC''')
headers = {}
for i, lasttime_could_not_be_used in cursor:
session = requests.Session()