MySQL backend is basically working
This commit is contained in:
parent
c958e44632
commit
7cde3c597a
|
@ -31,16 +31,17 @@ def get_cache(directory, name):
|
||||||
, user=config["mysql_user"]
|
, user=config["mysql_user"]
|
||||||
, password=config["mysql_password"]
|
, password=config["mysql_password"]
|
||||||
, db=config["mysql_database"]
|
, db=config["mysql_database"]
|
||||||
, charset="utf8")
|
, charset="utf8mb4")
|
||||||
|
|
||||||
cursor = db.cursor()
|
cursor = db.cursor()
|
||||||
|
|
||||||
cursor.execute("CREATE TABLE IF NOT EXISTS proxies(proxy varchar(100), lasttime_could_not_be_used DECIMAL)")
|
cursor.execute("CREATE TABLE IF NOT EXISTS proxies(proxy varchar(100), lasttime_could_not_be_used DECIMAL)")
|
||||||
cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(50), destination varchar(50))")
|
cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(100) CHARACTER SET utf8mb4, destination varchar(100) CHARACTER SET utf8mb4)")
|
||||||
cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(50) UNIQUE, value INT)")
|
cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(100) CHARACTER SET utf8mb4 UNIQUE, value INT)")
|
||||||
cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(50), depth INT)")
|
cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(100) CHARACTER SET utf8mb4, depth INT)")
|
||||||
|
|
||||||
db.commit()
|
db.commit()
|
||||||
|
fetch_proxies(db)
|
||||||
return db
|
return db
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,5 +4,5 @@ config = {
|
||||||
, "mysql_server": "172.17.0.2"
|
, "mysql_server": "172.17.0.2"
|
||||||
, "mysql_user": "wikipedia"
|
, "mysql_user": "wikipedia"
|
||||||
, "mysql_password": "wikipediastuff"
|
, "mysql_password": "wikipediastuff"
|
||||||
, "mysql_database": "wikipedia_link_data"
|
, "mysql_database": "wikipedia_link_db"
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@ from cfg import config
|
||||||
|
|
||||||
def can_reach(title, connection):
|
def can_reach(title, connection):
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
cursor.execute("SELECT COUNT(destination) FROM links WHERE destination=?", (title, ))
|
cursor.execute("SELECT COUNT(destination) FROM links WHERE destination=%s", (title, ))
|
||||||
count = cursor.fetchone()[0]
|
count = cursor.fetchone()[0]
|
||||||
return count > 0
|
return count > 0
|
||||||
|
|
||||||
|
@ -17,20 +17,12 @@ def shortest_path(center, title, connection):
|
||||||
path = deque()
|
path = deque()
|
||||||
while(current_title != center):
|
while(current_title != center):
|
||||||
path.append(current_title)
|
path.append(current_title)
|
||||||
if(config["use_sqlite"]):
|
|
||||||
cursor.execute('''SELECT links.source
|
cursor.execute('''SELECT links.source
|
||||||
FROM links
|
FROM links
|
||||||
LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
|
LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
|
||||||
WHERE links.destination=:title
|
WHERE links.destination=:title
|
||||||
ORDER BY dijkstra_helper.value ASC
|
ORDER BY dijkstra_helper.value ASC
|
||||||
LIMIT 1''', {"title": current_title})
|
LIMIT 1''', {"title": current_title})
|
||||||
else:
|
|
||||||
cursor.execute('''SELECT links.source
|
|
||||||
FROM links
|
|
||||||
LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
|
|
||||||
WHERE links.destination=:title
|
|
||||||
SORT BY dijkstra_helper.value ASC
|
|
||||||
LIMIT 1''', {"title": current_title})
|
|
||||||
current_title = cursor.fetchone()[0]
|
current_title = cursor.fetchone()[0]
|
||||||
return list(reversed(path))
|
return list(reversed(path))
|
||||||
|
|
||||||
|
|
|
@ -50,7 +50,7 @@ def recursive_dijkstra(titles, value, connection):
|
||||||
|
|
||||||
def dijkstra(title, connection):
|
def dijkstra(title, connection):
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
cursor.execute("UPDATE dijkstra_helper SET value=0 WHERE name=?", (title,))
|
cursor.execute("UPDATE dijkstra_helper SET value=0 WHERE name=%s", (title,))
|
||||||
|
|
||||||
todos = dijkstra_one(title, 1, connection)
|
todos = dijkstra_one(title, 1, connection)
|
||||||
recursive_dijkstra(todos, 2, connection)
|
recursive_dijkstra(todos, 2, connection)
|
||||||
|
|
|
@ -18,10 +18,7 @@ class NoMoreProxiesException(Exception):
|
||||||
def get_data_with_proxy(url, conn_object, visit_first=None):
|
def get_data_with_proxy(url, conn_object, visit_first=None):
|
||||||
cursor = conn_object.cursor()
|
cursor = conn_object.cursor()
|
||||||
# Assume that table name is proxies
|
# Assume that table name is proxies
|
||||||
if(config["use_sqlite"]):
|
|
||||||
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
|
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
|
||||||
else:
|
|
||||||
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies SORT BY lasttime_could_not_be_used ASC''')
|
|
||||||
headers = {}
|
headers = {}
|
||||||
for i, lasttime_could_not_be_used in cursor:
|
for i, lasttime_could_not_be_used in cursor:
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
|
@ -30,12 +27,12 @@ def get_data_with_proxy(url, conn_object, visit_first=None):
|
||||||
response = session.get(url, headers=headers, timeout=3)
|
response = session.get(url, headers=headers, timeout=3)
|
||||||
except:
|
except:
|
||||||
# If proxy is invalid/inactive, update lasttime could not be used and go next proxy
|
# If proxy is invalid/inactive, update lasttime could not be used and go next proxy
|
||||||
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
|
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %d WHERE proxy = %s ''',
|
||||||
(time.time(), i))
|
(time.time(), i))
|
||||||
continue
|
continue
|
||||||
# If text is empty, update lasttime could not be used and go next proxy
|
# If text is empty, update lasttime could not be used and go next proxy
|
||||||
if not response.text or 399 < response.status_code < 600:
|
if not response.text or 399 < response.status_code < 600:
|
||||||
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
|
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %d WHERE proxy = %s ''',
|
||||||
(time.time(), i))
|
(time.time(), i))
|
||||||
continue
|
continue
|
||||||
return response.json()
|
return response.json()
|
||||||
|
@ -62,7 +59,7 @@ def fetch_proxies(connection):
|
||||||
url = "http://{}:{}".format(ip_addr, port)
|
url = "http://{}:{}".format(ip_addr, port)
|
||||||
|
|
||||||
if(not proxy_is_in_db(url, connection)):
|
if(not proxy_is_in_db(url, connection)):
|
||||||
cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
|
cursor.execute("INSERT INTO proxies VALUES(%s, 0)", (url,))
|
||||||
cnt += 1
|
cnt += 1
|
||||||
logging.info("added {} new proxies".format(cnt))
|
logging.info("added {} new proxies".format(cnt))
|
||||||
connection.commit()
|
connection.commit()
|
||||||
|
@ -91,8 +88,8 @@ def _get_rows(soup):
|
||||||
|
|
||||||
def proxy_is_in_db(url, connection):
|
def proxy_is_in_db(url, connection):
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
cursor.execute("SELECT proxy FROM proxies WHERE proxy = ?", (url,))
|
cursor.execute("SELECT proxy FROM proxies WHERE proxy = %s", (url,))
|
||||||
return cursor.fetchall() != []
|
return bool(cursor.fetchall())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,18 @@ from proxy import get_data_with_proxy, NoMoreProxiesException
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def ignore_title(title):
    """Return True if *title* starts with one of the ignored prefixes.

    The comparison is case-sensitive and expects underscores instead of
    spaces (e.g. ``Template_talk:``).

    :param title: page title to check.
    :returns: ``True`` when the title begins with an ignored prefix,
        ``False`` otherwise (including for the empty string).
    """
    ignore_starters = (
        "Help:",
        "Wikipedia:",
        "Template:",
        "Template_talk:",
        "Category:",
    )
    # str.startswith accepts a tuple and tests every prefix in a single call.
    return title.startswith(ignore_starters)
|
||||||
|
|
||||||
def _receive_links(title, connection):
|
def _receive_links(title, connection):
|
||||||
url = construct_url(title)
|
url = construct_url(title)
|
||||||
|
|
||||||
|
@ -15,14 +27,22 @@ def _receive_links(title, connection):
|
||||||
destination_title = page_data["title"].replace(" ", "_")
|
destination_title = page_data["title"].replace(" ", "_")
|
||||||
# avoid 1-loops
|
# avoid 1-loops
|
||||||
if(destination_title == title):
|
if(destination_title == title):
|
||||||
pass
|
continue
|
||||||
cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
|
if(ignore_title(title)):
|
||||||
|
continue
|
||||||
|
cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))
|
||||||
yield destination_title
|
yield destination_title
|
||||||
|
|
||||||
else:
|
else:
|
||||||
for destination in page_data["links"]:
|
for destination in page_data["links"]:
|
||||||
|
if(ignore_title(title)):
|
||||||
|
continue
|
||||||
destination_title = destination["title"].replace(" ", "_")
|
destination_title = destination["title"].replace(" ", "_")
|
||||||
cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
|
try:
|
||||||
|
cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))
|
||||||
|
except Exception as e:
|
||||||
|
print(destination_title)
|
||||||
|
raise e
|
||||||
yield destination_title
|
yield destination_title
|
||||||
connection.commit()
|
connection.commit()
|
||||||
|
|
||||||
|
@ -36,7 +56,8 @@ def receive_link_graph(title, connection, depth):
|
||||||
return
|
return
|
||||||
|
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
|
print(repr(title))
|
||||||
|
cursor.execute("SELECT COUNT(source) FROM links WHERE source=%s", (title,))
|
||||||
if(cursor.fetchone()[0] != 0):
|
if(cursor.fetchone()[0] != 0):
|
||||||
# we fetched that title already
|
# we fetched that title already
|
||||||
return
|
return
|
||||||
|
@ -52,7 +73,7 @@ def receive_link_graph(title, connection, depth):
|
||||||
# Retry later, so we have to store our list that is still to fetch.
|
# Retry later, so we have to store our list that is still to fetch.
|
||||||
|
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, depth - 1))
|
# DB-API "format" paramstyle: the placeholder is always %s, whatever the Python type of the value.
cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(%s, %s)", (link, depth - 1))
|
||||||
connection.commit()
|
connection.commit()
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user