diff --git a/exam/ex01/cache.py b/exam/ex01/cache.py
new file mode 100644
index 0000000..0dc0c80
--- /dev/null
+++ b/exam/ex01/cache.py
@@ -0,0 +1,28 @@
+import os
+import sqlite3
+
+from proxy import fetch_proxies
+
+def get_cache(directory):
+    # sqlite3.connect() does not create missing directories, so do that first.
+    os.makedirs(directory, exist_ok=True)
+    cache_file = os.path.join(directory, "cache.sqlite")
+    # connect() creates the database file itself if it does not exist yet.
+    db = sqlite3.connect(cache_file)
+
+    cursor = db.cursor()
+
+    # IF NOT EXISTS keeps reopening an existing cache from failing.
+    cursor.execute("CREATE TABLE IF NOT EXISTS proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
+    cursor.execute("CREATE TABLE IF NOT EXISTS links(source TEXT, destination TEXT)")
+    cursor.execute("CREATE TABLE IF NOT EXISTS dijkstra_helper(name TEXT, value INT)")
+
+    db.commit()
+    fetch_proxies(db)
+    return db
+
+def clear_cache_data(connection):
+    cursor = connection.cursor()
+    cursor.execute("DELETE FROM links")
+    cursor.execute("DELETE FROM dijkstra_helper")
+    connection.commit()
diff --git a/exam/ex01/main.py b/exam/ex01/main.py
new file mode 100644
index 0000000..5c9a08a
--- /dev/null
+++ b/exam/ex01/main.py
@@ -0,0 +1,9 @@
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+
+from cache import get_cache
+from receive import receive_link_graph
+
+cache = get_cache("./cache/")
+receive_link_graph("Angela_Merkel", cache, 3)
diff --git a/exam/ex01/proxy.py b/exam/ex01/proxy.py
new file mode 100644
index 0000000..61d383a
--- /dev/null
+++ b/exam/ex01/proxy.py
@@ -0,0 +1,93 @@
+"""
+Module to fetch new proxies from
+https://www.proxynova.com/proxy-server-list/country-de/
+"""
+
+import requests
+import logging
+import time
+from bs4 import BeautifulSoup
+
+logger = logging.getLogger(__name__)
+
+def get_data_with_proxy(url, conn_object, visit_first=None):
+    cursor = conn_object.cursor()
+    # Try the proxies that failed longest ago first.
+    cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
+    # Fetch all rows up front: the UPDATEs below would otherwise clobber an
+    # iteration running directly over the cursor.
+    proxies = cursor.fetchall()
+    headers = {}
+    for proxy, lasttime_could_not_be_used in proxies:
+        session = requests.Session()
+        session.proxies = {'http': proxy}
+        try:
+            response = session.get(url, headers=headers, timeout=3)
+        except requests.RequestException:
+            # The proxy is invalid/inactive: record the failure, try the next one.
+            cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ?''',
+                           (time.time(), proxy))
+            continue
+        # Empty body or error status: record the failure, try the next one.
+        if not response.text or 399 < response.status_code < 600:
+            cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ?''',
+                           (time.time(), proxy))
+            continue
+        conn_object.commit()
+        return response.json()
+    conn_object.commit()
+    raise Exception("No more proxies left")
+
+
+def fetch_proxies(connection):
+    """
+    Fetch new proxies from
+    https://www.proxynova.com/proxy-server-list/country-de/
+    and put them in the database ``connection``.
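+
+    Assumes the ``proxies`` table created in cache.py:
+    ``proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)``.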
+    """
+    resp = requests.get("https://www.proxynova.com/proxy-server-list/country-de/")
+    logger.info("request status code: {}; elapsed us: {}".format(resp.status_code,
+                                                                 resp.elapsed.microseconds))
+    if resp.status_code != 200:
+        logger.error("status code is not 200")
+        raise Exception("failed to retrieve proxy list")
+    soup = BeautifulSoup(resp.text, "html.parser")
+
+    cursor = connection.cursor()
+    added = 0
+    for ip_addr, port in _get_rows(soup):
+        url = "http://{}:{}".format(ip_addr, port)
+        if not proxy_is_in_db(url, connection):
+            cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
+            added += 1
+    logger.info("added {} new proxies".format(added))
+    connection.commit()
+
+
+def _get_rows(soup):
+    count = 0
+    for i, row in enumerate(soup.findAll("tr")):
+        # The first row is a header.
+        if i == 0:
+            continue
+        try:
+            columns = row.findAll("td")
+            ip_addr, port = [col.get_text() for col in columns[0:2]]
+            port = port.strip()
+            # The site pads the IP address with junk characters as a weak
+            # obfuscation; slicing the padding out recovers the plain address.
+            ip_addr = ip_addr[25:30] + ip_addr[45:-5]
+            count += 1
+            yield ip_addr, port
+        except Exception:
+            break
+    logger.info("retrieved {} proxies".format(count))
+
+
+def proxy_is_in_db(url, connection):
+    cursor = connection.cursor()
+    cursor.execute("SELECT proxy FROM proxies WHERE proxy = ?", (url,))
+    return cursor.fetchall() != []
diff --git a/exam/ex01/receive.py b/exam/ex01/receive.py
new file mode 100644
index 0000000..876b4ea
--- /dev/null
+++ b/exam/ex01/receive.py
@@ -0,0 +1,47 @@
+import logging
+from url import construct_url
+from proxy import get_data_with_proxy
+
+logger = logging.getLogger(__name__)
+
+def _receive_links(title, connection):
+    url = construct_url(title)
+
+    result = get_data_with_proxy(url, connection)
+    # Iterate over the "pages" mapping because the page ID is not known.
+    for k, page_data in result["query"]["pages"].items():
+        cursor = connection.cursor()
+        if "links" not in page_data:
+            destination_title = page_data["title"].replace(" ", "_")
+            # Avoid links from a page to itself.
+            if destination_title == title:
+                continue
+            cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
+            yield destination_title
+
+        else:
+            for destination in page_data["links"]:
+                destination_title = destination["title"].replace(" ", "_")
+                cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
+                yield destination_title
+    connection.commit()
+
+def receive_links(title, connection):
+    return list(_receive_links(title, connection))
+
+
+def receive_link_graph(title, connection, depth):
+    if depth < 0:
+        # end of recursion
+        return
+
+    cursor = connection.cursor()
+    cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
+    if cursor.fetchone()[0] != 0:
+        # this title has been fetched already
+        return
+
+    logger.info("fetching links for {}".format(title))
+
+    for link in _receive_links(title, connection):
+        receive_link_graph(link, connection, depth - 1)
diff --git a/exam/ex01/url.py b/exam/ex01/url.py
new file mode 100644
index 0000000..bc83561
--- /dev/null
+++ b/exam/ex01/url.py
@@ -0,0 +1,5 @@
+from urllib.parse import quote
+
+def construct_url(title):
+    # Quote the title so special characters cannot break the query string.
+    return "https://en.wikipedia.org/w/api.php?action=query&prop=links&pllimit=500&titles={}&format=json".format(quote(title))
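
A minimal usage sketch of how these modules fit together, assuming the files above sit in the working directory and ./cache/ is writable (the depth of 2 and the log level are arbitrary choices for illustration):

    import logging
    logging.basicConfig(level=logging.INFO)

    from cache import get_cache, clear_cache_data
    from receive import receive_link_graph

    # Build (or reopen) the cache; get_cache also fills the proxies table.
    cache = get_cache("./cache/")

    # Crawl the link graph two levels deep, starting from one article.
    receive_link_graph("Angela_Merkel", cache, 2)

    # Inspect what was collected.
    cursor = cache.cursor()
    cursor.execute("SELECT COUNT(*) FROM links")
    print("links stored:", cursor.fetchone()[0])

    # Drop the collected links (but keep the proxies) to start over.
    clear_cache_data(cache)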