Basic MySQL support is now working.

This commit is contained in:
Daniel Knüttel 2019-02-15 12:46:32 +01:00
parent c958e44632
commit 7cde3c597a
6 changed files with 46 additions and 35 deletions

View File

@@ -31,16 +31,17 @@ def get_cache(directory, name):
, user=config["mysql_user"]
, password=config["mysql_password"]
, db=config["mysql_database"]
, charset="utf8")
, charset="utf8mb4")
cursor = db.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS proxies(proxy varchar(100), lasttime_could_not_be_used DECIMAL)")
cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(50), destination varchar(50))")
cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(50) UNIQUE, value INT)")
cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(50), depth INT)")
cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(100) CHARACTER SET utf8mb4, destination varchar(100) CHARACTER SET utf8mb4)")
cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(100) CHARACTER SET utf8mb4 UNIQUE, value INT)")
cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(100) CHARACTER SET utf8mb4, depth INT)")
db.commit()
fetch_proxies(db)
return db

View File

@@ -4,5 +4,5 @@ config = {
, "mysql_server": "172.17.0.2"
, "mysql_user": "wikipedia"
, "mysql_password": "wikipediastuff"
, "mysql_database": "wikipedia_link_data"
, "mysql_database": "wikipedia_link_db"
}

View File

@@ -4,7 +4,7 @@ from cfg import config
def can_reach(title, connection):
cursor = connection.cursor()
cursor.execute("SELECT COUNT(destination) FROM links WHERE destination=?", (title, ))
cursor.execute("SELECT COUNT(destination) FROM links WHERE destination=%s", (title, ))
count = cursor.fetchone()[0]
return count > 0
@@ -17,20 +17,12 @@ def shortest_path(center, title, connection):
path = deque()
while(current_title != center):
path.append(current_title)
if(config["use_sqlite"]):
cursor.execute('''SELECT links.source
FROM links
LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
WHERE links.destination=:title
ORDER BY dijkstra_helper.value ASC
LIMIT 1''', {"title": current_title})
else:
cursor.execute('''SELECT links.source
FROM links
LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
WHERE links.destination=:title
SORT BY dijkstra_helper.value ASC
LIMIT 1''', {"title": current_title})
current_title = cursor.fetchone()[0]
return list(reversed(path))

View File

@@ -50,7 +50,7 @@ def recursive_dijkstra(titles, value, connection):
def dijkstra(title, connection):
cursor = connection.cursor()
cursor.execute("UPDATE dijkstra_helper SET value=0 WHERE name=?", (title,))
cursor.execute("UPDATE dijkstra_helper SET value=0 WHERE name=%s", (title,))
todos = dijkstra_one(title, 1, connection)
recursive_dijkstra(todos, 2, connection)

View File

@@ -18,10 +18,7 @@ class NoMoreProxiesException(Exception):
def get_data_with_proxy(url, conn_object, visit_first=None):
cursor = conn_object.cursor()
# Assume that table name is proxies
if(config["use_sqlite"]):
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
else:
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies SORT BY lasttime_could_not_be_used ASC''')
headers = {}
for i, lasttime_could_not_be_used in cursor:
session = requests.Session()
@@ -30,12 +27,12 @@ def get_data_with_proxy(url, conn_object, visit_first=None):
response = session.get(url, headers=headers, timeout=3)
except:
# If proxy is invalid/inactive, update lasttime could not be used and go next proxy
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %d WHERE proxy = %s ''',
(time.time(), i))
continue
# If text is empty, update lasttime could not be used and go next proxy
if not response.text or 399 < response.status_code < 600:
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %d WHERE proxy = %s ''',
(time.time(), i))
continue
return response.json()
@@ -62,7 +59,7 @@ def fetch_proxies(connection):
url = "http://{}:{}".format(ip_addr, port)
if(not proxy_is_in_db(url, connection)):
cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
cursor.execute("INSERT INTO proxies VALUES(%s, 0)", (url,))
cnt += 1
logging.info("added {} new proxies".format(cnt))
connection.commit()
@@ -91,8 +88,8 @@ def _get_rows(soup):
def proxy_is_in_db(url, connection):
cursor = connection.cursor()
cursor.execute("SELECT proxy FROM proxies WHERE proxy = ?", (url,))
return cursor.fetchall() != []
cursor.execute("SELECT proxy FROM proxies WHERE proxy = %s", (url,))
return bool(cursor.fetchall())

View File

@@ -4,6 +4,18 @@ from proxy import get_data_with_proxy, NoMoreProxiesException
logger = logging.getLogger(__name__)
def ignore_title(title):
ignore_starters = ["Help:"
, "Wikipedia:"
, "Template:"
, "Template_talk:"
, "Category:"
]
for ignore in ignore_starters:
if(title.startswith(ignore)):
return True
return False
def _receive_links(title, connection):
url = construct_url(title)
@@ -15,14 +27,22 @@ def _receive_links(title, connection):
destination_title = page_data["title"].replace(" ", "_")
# avoid 1-loops
if(destination_title == title):
pass
cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
continue
if(ignore_title(title)):
continue
cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))
yield destination_title
else:
for destination in page_data["links"]:
if(ignore_title(title)):
continue
destination_title = destination["title"].replace(" ", "_")
cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
try:
cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))
except Exception as e:
print(destination_title)
raise e
yield destination_title
connection.commit()
@@ -36,7 +56,8 @@ def receive_link_graph(title, connection, depth):
return
cursor = connection.cursor()
cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
print(repr(title))
cursor.execute("SELECT COUNT(source) FROM links WHERE source=%s", (title,))
if(cursor.fetchone()[0] != 0):
# we fetched that title already
return
@@ -52,7 +73,7 @@ def receive_link_graph(title, connection, depth):
# Retry later, so we have to store our list that is still to fetch.
cursor = connection.cursor()
cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, depth - 1))
cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(%s, %d)", (link, depth - 1))
connection.commit()