initial stuff for ex01

Daniel Knüttel 2019-02-02 11:18:57 +01:00
parent 36f4ee2501
commit 783ec462e3
5 changed files with 177 additions and 0 deletions

exam/ex01/cache.py (new file, 28 lines)

@@ -0,0 +1,28 @@
import os
import sqlite3

from proxy import fetch_proxies


def get_cache(directory):
    cache_file = os.path.join(directory, "cache.sqlite")

    if(not os.path.exists(cache_file)):
        # First use: create the database file and its schema.
        with open(cache_file, "w"):
            pass
        db = sqlite3.connect(cache_file)
        cursor = db.cursor()
        cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
        cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT, value INT)")
        db.commit()

    db = sqlite3.connect(cache_file)
    # Refresh the proxy list on every start.
    fetch_proxies(db)
    return db


def clear_cache_data(connection):
    cursor = connection.cursor()
    cursor.execute("DELETE FROM links")
    cursor.execute("DELETE FROM dijkstra_helper")
    connection.commit()
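A minimal usage sketch (not part of this commit, assuming the ./cache/ directory already exists): open the cache once, then wipe the crawled data while keeping the proxy list.

from cache import get_cache, clear_cache_data

db = get_cache("./cache/")   # creates cache.sqlite and its tables on first use, then refreshes the proxies
clear_cache_data(db)         # empties only the links and dijkstra_helper tables
db.close()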

exam/ex01/main.py (new file, 9 lines)

@@ -0,0 +1,9 @@
import logging
logging.basicConfig(level=logging.DEBUG)

from cache import get_cache
from receive import receive_links, receive_link_graph

cache = get_cache("./cache/")

receive_link_graph("Angela_Merkel", cache, 3)
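After a run, the harvested graph sits in the links table of cache.sqlite; a quick way to inspect it (a sketch, not part of this commit):

import sqlite3

db = sqlite3.connect("./cache/cache.sqlite")
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM links")
print("stored edges:", cursor.fetchone()[0])
cursor.execute("SELECT destination FROM links WHERE source = ?", ("Angela_Merkel",))
print("direct neighbours:", len(cursor.fetchall()))
db.close()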

exam/ex01/proxy.py (new file, 88 lines)

@@ -0,0 +1,88 @@
"""
Module to fetch new proxies from
https://www.proxynova.com/proxy-server-list/country-de/
"""
import requests
import logging
import time
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
def get_data_with_proxy(url, conn_object, visit_first=None):
cursor = conn_object.cursor()
# Assume that table name is proxies
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
headers = {}
for i, lasttime_could_not_be_used in cursor:
session = requests.Session()
session.proxies = { 'http': i}
try:
response = session.get(url, headers=headers, timeout=3)
except:
# If proxy is invalid/inactive, update lasttime could not be used and go next proxy
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
(time.time(), i))
continue
# If text is empty, update lasttime could not be used and go next proxy
if not response.text or 399 < response.status_code < 600:
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
(time.time(), i))
continue
return response.json()
raise Exception("No more proxies left")
def fetch_proxies(connection):
"""
Fetch new proxies from https://us-proxy.org/ and put them
in the database ``connection``.
"""
resp = requests.request(url="https://www.proxynova.com/proxy-server-list/country-de/", method="GET")
logger.info("request status code: {}; elapsed us: {}".format(resp.status_code,
resp.elapsed.microseconds))
if(resp.status_code != 200):
logger.error("status code is not 200")
raise Exception("failed to retrieve proxy list")
soup = BeautifulSoup(resp.text, "html.parser")
cursor = connection.cursor()
for i, (ip_addr, port) in enumerate(_get_rows(soup)):
url = "http://{}:{}".format(ip_addr, port)
if(not proxy_is_in_db(url, connection)):
cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
logging.info("added {} new proxies".format(i))
connection.commit()
def _get_rows(soup):
for i, row in enumerate(soup.findAll("tr")):
# first row is a header
if(i == 0):
continue
try:
columns = row.findAll("td")
ip_addr, port = [i.get_text() for i in columns[0:2]]
port = port.strip()
# Well they thought that they kinda obfuscated that.
# Unfortunately their obfuscation is crap and so this bypasses
# it.
ip_addr = ip_addr[25:30] + ip_addr[45:-5]
yield ip_addr, port
except Exception as e:
break
logger.info("retrieved {} proxies".format(i))
def proxy_is_in_db(url, connection):
cursor = connection.cursor()
cursor.execute("SELECT proxy FROM proxies WHERE proxy = ?", (url,))
return cursor.fetchall() != []
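The two halves of this module work together: fetch_proxies fills the proxies table and get_data_with_proxy walks it, least recently failed first, until a request succeeds. A standalone sketch with an in-memory database (not part of this commit, assuming network access and the current proxynova page layout):

import sqlite3
from proxy import fetch_proxies, get_data_with_proxy

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")

fetch_proxies(db)                   # scrape the proxy list into the table
data = get_data_with_proxy(         # rotate through the proxies until one answers
        "https://en.wikipedia.org/w/api.php?action=query&titles=Angela_Merkel&format=json",
        db)
print(list(data["query"]["pages"].keys()))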

exam/ex01/receive.py (new file, 49 lines)

@@ -0,0 +1,49 @@
import logging

from url import construct_url
from proxy import get_data_with_proxy

logger = logging.getLogger(__name__)


def _receive_links(title, connection):
    url = construct_url(title)

    result = get_data_with_proxy(url, connection)

    # Iterate over the returned pages because the page ID is not known
    # in advance.
    for k, page_data in result["query"]["pages"].items():
        cursor = connection.cursor()
        if(not "links" in page_data):
            destination_title = page_data["title"].replace(" ", "_")
            # Avoid 1-loops.
            if(destination_title == title):
                continue
            cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
            yield destination_title
        else:
            for destination in page_data["links"]:
                destination_title = destination["title"].replace(" ", "_")
                cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
                yield destination_title
    connection.commit()


def receive_links(title, connection):
    return list(_receive_links(title, connection))


def receive_link_graph(title, connection, depth):
    if(depth < 0):
        # Maximum depth reached; end the recursion.
        return

    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
    if(cursor.fetchone()[0] != 0):
        # The links for this title have already been fetched.
        return

    logger.info("fetching links for {}".format(title))
    for link in _receive_links(title, connection):
        receive_link_graph(link, connection, depth - 1)
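The "already fetched" test relies on the links table alone: a page counts as done as soon as at least one outgoing edge is stored for it. As a standalone illustration of that check (a hypothetical helper, not part of this commit):

def already_fetched(title, connection):
    # Mirrors the guard in receive_link_graph.
    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
    return cursor.fetchone()[0] != 0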

exam/ex01/url.py (new file, 3 lines)

@@ -0,0 +1,3 @@
def construct_url(title):
    return "https://en.wikipedia.org/w/api.php?action=query&prop=links&pllimit=500&titles={}&format=json".format(title)