initial stuff for ex01

Daniel Knüttel 2019-02-02 11:18:57 +01:00
parent 36f4ee2501
commit 783ec462e3
5 changed files with 177 additions and 0 deletions

exam/ex01/cache.py 100644

@@ -0,0 +1,28 @@
import os
import sqlite3

from proxy import fetch_proxies


def get_cache(directory):
    """
    Open the sqlite cache in ``directory``, creating and initializing it on first use.
    """
    cache_file = os.path.join(directory, "cache.sqlite")

    if(not os.path.exists(cache_file)):
        # Create the empty database file and set up the schema.
        with open(cache_file, "w") as fin:
            pass
        db = sqlite3.connect(cache_file)
        cursor = db.cursor()
        cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
        cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT, value INT)")
        db.commit()

    db = sqlite3.connect(cache_file)
    fetch_proxies(db)
    return db


def clear_cache_data(connection):
    """
    Delete the crawled link graph and the Dijkstra helper data, keeping the proxies.
    """
    cursor = connection.cursor()
    cursor.execute("DELETE FROM links")
    cursor.execute("DELETE FROM dijkstra_helper")
    connection.commit()
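
Not part of the commit, but a quick sanity check of cache.py: the sketch below (assuming the module layout above, an existing ./cache/ directory, and network access, since get_cache() calls fetch_proxies()) opens the cache, lists the created tables, and wipes the crawl data.

from cache import get_cache, clear_cache_data

db = get_cache("./cache/")

# List the tables created by the schema above.
cursor = db.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print([row[0] for row in cursor.fetchall()])   # ['proxies', 'links', 'dijkstra_helper']

# Drop the crawled data, keep the proxies.
clear_cache_data(db)
db.close()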


@@ -0,0 +1,9 @@
import logging
logging.basicConfig(level=logging.DEBUG)

from cache import get_cache
from receive import receive_links, receive_link_graph

# Open (or create) the cache; the ./cache/ directory must already exist,
# since get_cache() only creates the sqlite file inside it.
cache = get_cache("./cache/")

# Crawl the Wikipedia link graph around Angela_Merkel up to depth 3.
receive_link_graph("Angela_Merkel", cache, 3)

exam/ex01/proxy.py 100644

@@ -0,0 +1,88 @@
"""
Module to fetch new proxies from
https://www.proxynova.com/proxy-server-list/country-de/
"""
import requests
import logging
import time
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
def get_data_with_proxy(url, conn_object, visit_first=None):
cursor = conn_object.cursor()
# Assume that table name is proxies
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
headers = {}
for i, lasttime_could_not_be_used in cursor:
session = requests.Session()
session.proxies = { 'http': i}
try:
response = session.get(url, headers=headers, timeout=3)
except:
# If proxy is invalid/inactive, update lasttime could not be used and go next proxy
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
(time.time(), i))
continue
# If text is empty, update lasttime could not be used and go next proxy
if not response.text or 399 < response.status_code < 600:
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
(time.time(), i))
continue
return response.json()
raise Exception("No more proxies left")
def fetch_proxies(connection):
"""
Fetch new proxies from https://us-proxy.org/ and put them
in the database ``connection``.
"""
resp = requests.request(url="https://www.proxynova.com/proxy-server-list/country-de/", method="GET")
logger.info("request status code: {}; elapsed us: {}".format(resp.status_code,
resp.elapsed.microseconds))
if(resp.status_code != 200):
logger.error("status code is not 200")
raise Exception("failed to retrieve proxy list")
soup = BeautifulSoup(resp.text, "html.parser")
cursor = connection.cursor()
for i, (ip_addr, port) in enumerate(_get_rows(soup)):
url = "http://{}:{}".format(ip_addr, port)
if(not proxy_is_in_db(url, connection)):
cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
logging.info("added {} new proxies".format(i))
connection.commit()
def _get_rows(soup):
for i, row in enumerate(soup.findAll("tr")):
# first row is a header
if(i == 0):
continue
try:
columns = row.findAll("td")
ip_addr, port = [i.get_text() for i in columns[0:2]]
port = port.strip()
# Well they thought that they kinda obfuscated that.
# Unfortunately their obfuscation is crap and so this bypasses
# it.
ip_addr = ip_addr[25:30] + ip_addr[45:-5]
yield ip_addr, port
except Exception as e:
break
logger.info("retrieved {} proxies".format(i))
def proxy_is_in_db(url, connection):
cursor = connection.cursor()
cursor.execute("SELECT proxy FROM proxies WHERE proxy = ?", (url,))
return cursor.fetchall() != []
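
For reference (not in the commit): a minimal usage sketch of get_data_with_proxy(), assuming a cache produced by get_cache(), at least one working proxy in it, and the url module from this commit.

from cache import get_cache
from proxy import get_data_with_proxy
from url import construct_url

db = get_cache("./cache/")
data = get_data_with_proxy(construct_url("Angela_Merkel"), db)

# The API nests results under the (unknown) numeric page id, as receive.py does.
for page_id, page_data in data["query"]["pages"].items():
    print(page_id, page_data["title"], len(page_data.get("links", [])))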

exam/ex01/receive.py 100644

@@ -0,0 +1,49 @@
import logging

from url import construct_url
from proxy import get_data_with_proxy

logger = logging.getLogger(__name__)


def _receive_links(title, connection):
    url = construct_url(title)
    result = get_data_with_proxy(url, connection)

    # Iterate over the pages dict; this is basically because we don't know the page ID.
    for k, page_data in result["query"]["pages"].items():
        cursor = connection.cursor()
        if(not "links" in page_data):
            destination_title = page_data["title"].replace(" ", "_")
            # avoid 1-loops
            if(destination_title == title):
                continue
            cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
            yield destination_title
        else:
            for destination in page_data["links"]:
                destination_title = destination["title"].replace(" ", "_")
                cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
                yield destination_title
    connection.commit()


def receive_links(title, connection):
    return list(_receive_links(title, connection))


def receive_link_graph(title, connection, depth):
    if(depth < 0):
        # end of recursion
        return

    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
    if(cursor.fetchone()[0] != 0):
        # we fetched that title already
        return

    logger.info("fetching links for {}".format(title))

    for link in _receive_links(title, connection):
        receive_link_graph(link, connection, depth - 1)
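
A short sketch (again not part of the commit, assuming the schema from cache.py and a working proxy list) of how the crawled graph can be inspected after receive_link_graph() has run:

from cache import get_cache
from receive import receive_link_graph

db = get_cache("./cache/")
receive_link_graph("Angela_Merkel", db, 1)   # shallow crawl for the example

cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM links WHERE source = ?", ("Angela_Merkel",))
print("outgoing links stored:", cursor.fetchone()[0])
cursor.execute("SELECT destination FROM links WHERE source = ? LIMIT 5", ("Angela_Merkel",))
print([row[0] for row in cursor.fetchall()])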

exam/ex01/url.py 100644

@@ -0,0 +1,3 @@
def construct_url(title):
    # Note: the title is inserted verbatim (not URL-encoded) into the query string.
    return "https://en.wikipedia.org/w/api.php?action=query&prop=links&pllimit=500&titles={}&format=json".format(title)