basically mysql is working
This commit is contained in:
parent
c958e44632
commit
7cde3c597a
|
@ -31,16 +31,17 @@ def get_cache(directory, name):
|
|||
, user=config["mysql_user"]
|
||||
, password=config["mysql_password"]
|
||||
, db=config["mysql_database"]
|
||||
, charset="utf8")
|
||||
, charset="utf8mb4")
|
||||
|
||||
cursor = db.cursor()
|
||||
|
||||
cursor.execute("CREATE TABLE IF NOT EXISTS proxies(proxy varchar(100), lasttime_could_not_be_used DECIMAL)")
|
||||
cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(50), destination varchar(50))")
|
||||
cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(50) UNIQUE, value INT)")
|
||||
cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(50), depth INT)")
|
||||
cursor.execute("CREATE TABLE IF NOT EXISTS links(source varchar(100) CHARACTER SET utf8mb4, destination varchar(100) CHARACTER SET utf8mb4)")
|
||||
cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name varchar(100) CHARACTER SET utf8mb4 UNIQUE, value INT)")
|
||||
cursor.execute("CREATE TABLE IF NOT EXISTS failed_to_fetch(title varchar(100) CHARACTER SET utf8mb4, depth INT)")
|
||||
|
||||
db.commit()
|
||||
fetch_proxies(db)
|
||||
return db
|
||||
|
||||
|
||||
|
|
|
@ -4,5 +4,5 @@ config = {
|
|||
, "mysql_server": "172.17.0.2"
|
||||
, "mysql_user": "wikipedia"
|
||||
, "mysql_password": "wikipediastuff"
|
||||
, "mysql_database": "wikipedia_link_data"
|
||||
, "mysql_database": "wikipedia_link_db"
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@ from cfg import config
|
|||
|
||||
def can_reach(title, connection):
    """Return whether at least one stored link points at *title*.

    Args:
        title: Page title compared against the ``destination`` column of
            the ``links`` table.
        connection: Open DB-API connection; only ``cursor()`` is called on it.

    Returns:
        ``True`` if ``links`` holds one or more rows whose ``destination``
        equals *title*, ``False`` otherwise.
    """
    cursor = connection.cursor()
    # Only the ``%s`` form is issued: the project's database is MySQL, and its
    # Python drivers do not understand the sqlite-style ``?`` placeholder.
    cursor.execute("SELECT COUNT(destination) FROM links WHERE destination=%s", (title, ))
    # COUNT(...) always yields exactly one row with one column.
    count = cursor.fetchone()[0]
    return count > 0
|
||||
|
||||
|
@ -17,20 +17,12 @@ def shortest_path(center, title, connection):
|
|||
path = deque()
|
||||
while(current_title != center):
|
||||
path.append(current_title)
|
||||
if(config["use_sqlite"]):
|
||||
cursor.execute('''SELECT links.source
|
||||
FROM links
|
||||
LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
|
||||
WHERE links.destination=:title
|
||||
ORDER BY dijkstra_helper.value ASC
|
||||
LIMIT 1''', {"title": current_title})
|
||||
else:
|
||||
cursor.execute('''SELECT links.source
|
||||
FROM links
|
||||
LEFT JOIN dijkstra_helper ON links.destination=dijkstra_helper.name
|
||||
WHERE links.destination=:title
|
||||
SORT BY dijkstra_helper.value ASC
|
||||
LIMIT 1''', {"title": current_title})
|
||||
current_title = cursor.fetchone()[0]
|
||||
return list(reversed(path))
|
||||
|
||||
|
|
|
@ -50,7 +50,7 @@ def recursive_dijkstra(titles, value, connection):
|
|||
|
||||
def dijkstra(title, connection):
|
||||
cursor = connection.cursor()
|
||||
cursor.execute("UPDATE dijkstra_helper SET value=0 WHERE name=?", (title,))
|
||||
cursor.execute("UPDATE dijkstra_helper SET value=0 WHERE name=%s", (title,))
|
||||
|
||||
todos = dijkstra_one(title, 1, connection)
|
||||
recursive_dijkstra(todos, 2, connection)
|
||||
|
|
|
@ -18,10 +18,7 @@ class NoMoreProxiesException(Exception):
|
|||
def get_data_with_proxy(url, conn_object, visit_first=None):
|
||||
cursor = conn_object.cursor()
|
||||
# Assume that table name is proxies
|
||||
if(config["use_sqlite"]):
|
||||
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
|
||||
else:
|
||||
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies SORT BY lasttime_could_not_be_used ASC''')
|
||||
headers = {}
|
||||
for i, lasttime_could_not_be_used in cursor:
|
||||
session = requests.Session()
|
||||
|
@ -30,12 +27,12 @@ def get_data_with_proxy(url, conn_object, visit_first=None):
|
|||
response = session.get(url, headers=headers, timeout=3)
|
||||
except:
|
||||
# If proxy is invalid/inactive, update lasttime could not be used and go next proxy
|
||||
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
|
||||
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %d WHERE proxy = %s ''',
|
||||
(time.time(), i))
|
||||
continue
|
||||
# If text is empty, update lasttime could not be used and go next proxy
|
||||
if not response.text or 399 < response.status_code < 600:
|
||||
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
|
||||
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %d WHERE proxy = %s ''',
|
||||
(time.time(), i))
|
||||
continue
|
||||
return response.json()
|
||||
|
@ -62,7 +59,7 @@ def fetch_proxies(connection):
|
|||
url = "http://{}:{}".format(ip_addr, port)
|
||||
|
||||
if(not proxy_is_in_db(url, connection)):
|
||||
cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
|
||||
cursor.execute("INSERT INTO proxies VALUES(%s, 0)", (url,))
|
||||
cnt += 1
|
||||
logging.info("added {} new proxies".format(cnt))
|
||||
connection.commit()
|
||||
|
@ -91,8 +88,8 @@ def _get_rows(soup):
|
|||
|
||||
def proxy_is_in_db(url, connection):
    """Return whether *url* is already stored in the ``proxies`` table.

    Args:
        url: Proxy URL compared against the ``proxy`` column.
        connection: Open DB-API connection; only ``cursor()`` is called on it.

    Returns:
        ``True`` if at least one matching row exists, ``False`` otherwise.
    """
    cursor = connection.cursor()
    # ``%s`` is the placeholder MySQL drivers expect; the sqlite-style ``?``
    # form is not issued any more.
    cursor.execute("SELECT proxy FROM proxies WHERE proxy = %s", (url,))
    # Truthiness instead of ``!= []``: an empty result may come back as an
    # empty tuple rather than an empty list, and ``() != []`` is True.
    return bool(cursor.fetchall())
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -4,6 +4,18 @@ from proxy import get_data_with_proxy, NoMoreProxiesException
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def ignore_title(title):
    """Return whether *title* starts with one of the prefixes to skip.

    The match is case-sensitive and only applies at the very start of the
    string. The prefixes look like Wikipedia non-article namespaces.

    Args:
        title: Page title string to test.

    Returns:
        ``True`` if *title* begins with any listed prefix, ``False`` otherwise.
    """
    ignore_starters = (
        "Help:",
        "Wikipedia:",
        "Template:",
        "Template_talk:",
        "Category:",
    )
    # str.startswith accepts a tuple and tests every prefix in one call.
    return title.startswith(ignore_starters)
|
||||
|
||||
def _receive_links(title, connection):
|
||||
url = construct_url(title)
|
||||
|
||||
|
@ -15,14 +27,22 @@ def _receive_links(title, connection):
|
|||
destination_title = page_data["title"].replace(" ", "_")
|
||||
# avoid 1-loops
|
||||
if(destination_title == title):
|
||||
pass
|
||||
cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
|
||||
continue
|
||||
if(ignore_title(title)):
|
||||
continue
|
||||
cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))
|
||||
yield destination_title
|
||||
|
||||
else:
|
||||
for destination in page_data["links"]:
|
||||
if(ignore_title(title)):
|
||||
continue
|
||||
destination_title = destination["title"].replace(" ", "_")
|
||||
cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
|
||||
try:
|
||||
cursor.execute("INSERT INTO links(source, destination) VALUES(%s, %s)", (title, destination_title))
|
||||
except Exception as e:
|
||||
print(destination_title)
|
||||
raise e
|
||||
yield destination_title
|
||||
connection.commit()
|
||||
|
||||
|
@ -36,7 +56,8 @@ def receive_link_graph(title, connection, depth):
|
|||
return
|
||||
|
||||
cursor = connection.cursor()
|
||||
cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
|
||||
print(repr(title))
|
||||
cursor.execute("SELECT COUNT(source) FROM links WHERE source=%s", (title,))
|
||||
if(cursor.fetchone()[0] != 0):
|
||||
# we fetched that title already
|
||||
return
|
||||
|
@ -52,7 +73,7 @@ def receive_link_graph(title, connection, depth):
|
|||
# Retry later, so we have to store our list that is still to fetch.
|
||||
|
||||
cursor = connection.cursor()
|
||||
cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(?, ?)", (link, depth - 1))
|
||||
cursor.execute("INSERT INTO failed_to_fetch(title, depth) VALUES(%s, %d)", (link, depth - 1))
|
||||
connection.commit()
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user