basically mysql is working

Daniel Knüttel 2019-02-15 12:46:32 +01:00
parent c958e44632
commit 7cde3c597a
6 changed files with 46 additions and 35 deletions

View File

@@ -31,16 +31,17 @@ def get_cache(directory, name):
             , user=config["mysql_user"]
             , password=config["mysql_password"]
             , db=config["mysql_database"]
-            , charset="utf8")
+            , charset="utf8mb4")
     cursor = db.cursor()
     cursor.execute("CREATE TABLE IF NOT EXISTS proxies(proxy varchar(100), lasttime_could_not_be_used DECIMAL)")
-    cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(50), destination varchar(50))")
-    cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(50) UNIQUE, value INT)")
-    cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(50), depth INT)")
+    cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(100) CHARACTER SET utf8mb4, destination varchar(100) CHARACTER SET utf8mb4)")
+    cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(100) CHARACTER SET utf8mb4 UNIQUE, value INT)")
+    cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(100) CHARACTER SET utf8mb4, depth INT)")
     db.commit()
+    fetch_proxies(db)
     return db
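
For orientation, a minimal standalone sketch of what this hunk sets up, assuming pymysql and reusing the connection values from cfg.py in this same commit; the example INSERT is illustrative and presumes the tables created above. The point carried through the rest of the diff is that pymysql substitutes parameters for %s markers, whereas sqlite3 used ?:

    import pymysql

    # Sketch only: values taken from cfg.py below, table layout from get_cache() above.
    db = pymysql.connect(host="172.17.0.2",
            user="wikipedia",
            password="wikipediastuff",
            db="wikipedia_link_db",
            charset="utf8mb4")  # utf8mb4 so arbitrary Unicode titles fit
    cursor = db.cursor()
    # pymysql paramstyle: %s placeholders instead of sqlite3's ?
    cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)",
            ("Example_source", "Example_destination"))
    db.commit()
    db.close()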

View File

@@ -4,5 +4,5 @@ config = {
     , "mysql_server": "172.17.0.2"
     , "mysql_user": "wikipedia"
     , "mysql_password": "wikipediastuff"
-    , "mysql_database": "wikipedia_link_data"
+    , "mysql_database": "wikipedia_link_db"
 }

View File

@@ -4,7 +4,7 @@ from cfg import config
 def can_reach(title, connection):
     cursor = connection.cursor()
-    cursor.execute("SELECT COUNT(destination) FROM links WHERE destination=?", (title, ))
+    cursor.execute("SELECT COUNT(destination) FROM links WHERE destination=%s", (title, ))
     count = cursor.fetchone()[0]
     return count > 0
@@ -17,20 +17,12 @@ def shortest_path(center, title, connection):
     path = deque()
     while(current_title != center):
         path.append(current_title)
-        if(config["use_sqlite"]):
-            cursor.execute('''SELECT links.source
-                    FROM links
-                    LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
-                    WHERE links.destination=:title
-                    ORDER BY dijkstra_helper.value ASC
-                    LIMIT 1''', {"title": current_title})
-        else:
-            cursor.execute('''SELECT links.source
-                    FROM links
-                    LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
-                    WHERE links.destination=:title
-                    SORT BY dijkstra_helper.value ASC
-                    LIMIT 1''', {"title": current_title})
+        cursor.execute('''SELECT links.source
+                FROM links
+                LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
+                WHERE links.destination=:title
+                ORDER BY dijkstra_helper.value ASC
+                LIMIT 1''', {"title": current_title})
         current_title = cursor.fetchone()[0]
     return list(reversed(path))

View File

@@ -50,7 +50,7 @@ def recursive_dijkstra(titles, value, connection):
 def dijkstra(title, connection):
     cursor = connection.cursor()
-    cursor.execute("UPDATE dijkstra_helper SET value=0 WHERE name=?", (title,))
+    cursor.execute("UPDATE dijkstra_helper SET value=0 WHERE name=%s", (title,))
     todos = dijkstra_one(title, 1, connection)
     recursive_dijkstra(todos, 2, connection)

View File

@@ -18,10 +18,7 @@ class NoMoreProxiesException(Exception):
 def get_data_with_proxy(url, conn_object, visit_first=None):
     cursor = conn_object.cursor()
     # Assume that table name is proxies
-    if(config["use_sqlite"]):
-        cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
-    else:
-        cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies SORT BY lasttime_could_not_be_used ASC''')
+    cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
     headers = {}
     for i, lasttime_could_not_be_used in cursor:
         session = requests.Session()
@@ -30,12 +27,12 @@ def get_data_with_proxy(url, conn_object, visit_first=None):
             response = session.get(url, headers=headers, timeout=3)
         except:
             # If proxy is invalid/inactive, update lasttime could not be used and go next proxy
-            cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
+            cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %d WHERE proxy = %s ''',
                 (time.time(), i))
             continue
         # If text is empty, update lasttime could not be used and go next proxy
         if not response.text or 399 < response.status_code < 600:
-            cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
+            cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %d WHERE proxy = %s ''',
                 (time.time(), i))
             continue
         return response.json()
@@ -62,7 +59,7 @@ def fetch_proxies(connection):
         url = "http://{}:{}".format(ip_addr, port)
         if(not proxy_is_in_db(url, connection)):
-            cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
+            cursor.execute("INSERT INTO proxies VALUES(%s, 0)", (url,))
             cnt += 1
     logging.info("added {} new proxies".format(cnt))
     connection.commit()
@@ -91,8 +88,8 @@ def _get_rows(soup):
 def proxy_is_in_db(url, connection):
     cursor = connection.cursor()
-    cursor.execute("SELECT proxy FROM proxies WHERE proxy = ?", (url,))
-    return cursor.fetchall() != []
+    cursor.execute("SELECT proxy FROM proxies WHERE proxy = %s", (url,))
+    return bool(cursor.fetchall())

View File

@@ -4,6 +4,18 @@ from proxy import get_data_with_proxy, NoMoreProxiesException
 logger = logging.getLogger(__name__)
 
+def ignore_title(title):
+    ignore_starters = ["Help:"
+                , "Wikipedia:"
+                , "Template:"
+                , "Template_talk:"
+                , "Category:"
+                ]
+    for ignore in ignore_starters:
+        if(title.startswith(ignore)):
+            return True
+    return False
+
 def _receive_links(title, connection):
     url = construct_url(title)
@@ -15,14 +27,22 @@ def _receive_links(title, connection):
             destination_title = page_data["title"].replace(" ", "_")
             # avoid 1-loops
             if(destination_title == title):
-                pass
-            cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
+                continue
+            if(ignore_title(title)):
+                continue
+            cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))
             yield destination_title
         else:
             for destination in page_data["links"]:
+                if(ignore_title(title)):
+                    continue
                 destination_title = destination["title"].replace(" ", "_")
-                cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
+                try:
+                    cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))
+                except Exception as e:
+                    print(destination_title)
+                    raise e
                 yield destination_title
 
     connection.commit()
@@ -36,7 +56,8 @@ def receive_link_graph(title, connection, depth):
         return
 
     cursor = connection.cursor()
-    cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
+    print(repr(title))
+    cursor.execute("SELECT COUNT(source) FROM links WHERE source=%s", (title,))
     if(cursor.fetchone()[0] != 0):
         # we fetched that title already
         return
@@ -52,7 +73,7 @@ def receive_link_graph(title, connection, depth):
             # Retry later, so we have to store our list that is still to fetch.
             cursor = connection.cursor()
-            cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, depth - 1))
+            cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(%s, %d)", (link, depth - 1))
             connection.commit()
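
Taken together, a rough usage sketch of the functions this commit touches; the function signatures come from the hunk headers above, while the module names and article titles are assumptions made for illustration:

    # Hypothetical driver script; module names are guesses, signatures are from the diff.
    from cache import get_cache
    from wikipedia import receive_link_graph
    from dijkstra import dijkstra
    from search import shortest_path

    db = get_cache("./cache", "wikipedia")        # MySQL connection, tables created, proxies fetched
    receive_link_graph("Example_article", db, 2)  # crawl outgoing links up to depth 2
    dijkstra("Example_article", db)               # fill dijkstra_helper with distances from the center
    print(shortest_path("Example_article", "Another_article", db))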