initial stuff for ex01
parent 36f4ee2501
commit 783ec462e3
exam/ex01/cache.py (new file, 28 lines)
@@ -0,0 +1,28 @@
import os
import sqlite3

from proxy import fetch_proxies

def get_cache(directory):
    cache_file = os.path.join(directory, "cache.sqlite")
    if(not os.path.exists(cache_file)):
        # First run: create the database file and the schema.
        with open(cache_file, "w") as fin:
            pass
        db = sqlite3.connect(cache_file)

        cursor = db.cursor()

        cursor.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")
        cursor.execute("CREATE TABLE links(source TEXT, destination TEXT)")
        cursor.execute("CREATE TABLE dijkstra_helper(name TEXT, value INT)")

        db.commit()
    db = sqlite3.connect(cache_file)
    fetch_proxies(db)
    return db

def clear_cache_data(connection):
    cursor = connection.cursor()
    cursor.execute("DELETE FROM links")
    cursor.execute("DELETE FROM dijkstra_helper")
    connection.commit()
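A minimal usage sketch of this cache layer, assuming the ./cache/ directory already exists (get_cache does not create it) and that the network is reachable, since get_cache also calls fetch_proxies:

import os

from cache import get_cache, clear_cache_data

os.makedirs("./cache/", exist_ok=True)   # get_cache expects the directory to exist
db = get_cache("./cache/")               # creates the schema on first run, then fetches proxies

cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM proxies")
print("proxies cached:", cursor.fetchone()[0])

clear_cache_data(db)                     # empties links and dijkstra_helper, keeps proxies
db.close()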
exam/ex01/main.py (new file, 9 lines)
@@ -0,0 +1,9 @@
import logging

logging.basicConfig(level=logging.DEBUG)

from cache import get_cache
from receive import receive_links, receive_link_graph

cache = get_cache("./cache/")
receive_link_graph("Angela_Merkel", cache, 3)
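main.py only kicks off the crawl; a sketch of how the resulting links table could be inspected afterwards, assuming the same ./cache/ directory and the table layout created in cache.py:

import sqlite3

db = sqlite3.connect("./cache/cache.sqlite")
cursor = db.cursor()

# Outgoing links that the crawl recorded for the start page.
cursor.execute("SELECT destination FROM links WHERE source = ?", ("Angela_Merkel",))
for (destination,) in cursor.fetchall()[:10]:
    print(destination)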
exam/ex01/proxy.py (new file, 88 lines)
@@ -0,0 +1,88 @@
"""
|
||||
Module to fetch new proxies from
|
||||
https://www.proxynova.com/proxy-server-list/country-de/
|
||||
"""
|
||||
|
||||
import requests
|
||||
import logging
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_data_with_proxy(url, conn_object, visit_first=None):
|
||||
cursor = conn_object.cursor()
|
||||
# Assume that table name is proxies
|
||||
cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
|
||||
headers = {}
|
||||
for i, lasttime_could_not_be_used in cursor:
|
||||
session = requests.Session()
|
||||
session.proxies = { 'http': i}
|
||||
try:
|
||||
response = session.get(url, headers=headers, timeout=3)
|
||||
except:
|
||||
# If proxy is invalid/inactive, update lasttime could not be used and go next proxy
|
||||
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
|
||||
(time.time(), i))
|
||||
continue
|
||||
# If text is empty, update lasttime could not be used and go next proxy
|
||||
if not response.text or 399 < response.status_code < 600:
|
||||
cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ? WHERE proxy = ? ''',
|
||||
(time.time(), i))
|
||||
continue
|
||||
return response.json()
|
||||
raise Exception("No more proxies left")
|
||||
|
||||
|
||||
def fetch_proxies(connection):
|
||||
"""
|
||||
Fetch new proxies from https://us-proxy.org/ and put them
|
||||
in the database ``connection``.
|
||||
"""
|
||||
resp = requests.request(url="https://www.proxynova.com/proxy-server-list/country-de/", method="GET")
|
||||
logger.info("request status code: {}; elapsed us: {}".format(resp.status_code,
|
||||
resp.elapsed.microseconds))
|
||||
if(resp.status_code != 200):
|
||||
logger.error("status code is not 200")
|
||||
raise Exception("failed to retrieve proxy list")
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
|
||||
cursor = connection.cursor()
|
||||
for i, (ip_addr, port) in enumerate(_get_rows(soup)):
|
||||
|
||||
url = "http://{}:{}".format(ip_addr, port)
|
||||
|
||||
if(not proxy_is_in_db(url, connection)):
|
||||
cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
|
||||
logging.info("added {} new proxies".format(i))
|
||||
connection.commit()
|
||||
|
||||
|
||||
|
||||
|
||||
def _get_rows(soup):
|
||||
for i, row in enumerate(soup.findAll("tr")):
|
||||
# first row is a header
|
||||
if(i == 0):
|
||||
continue
|
||||
try:
|
||||
columns = row.findAll("td")
|
||||
ip_addr, port = [i.get_text() for i in columns[0:2]]
|
||||
port = port.strip()
|
||||
# Well they thought that they kinda obfuscated that.
|
||||
# Unfortunately their obfuscation is crap and so this bypasses
|
||||
# it.
|
||||
ip_addr = ip_addr[25:30] + ip_addr[45:-5]
|
||||
yield ip_addr, port
|
||||
except Exception as e:
|
||||
break
|
||||
logger.info("retrieved {} proxies".format(i))
|
||||
|
||||
|
||||
def proxy_is_in_db(url, connection):
|
||||
cursor = connection.cursor()
|
||||
cursor.execute("SELECT proxy FROM proxies WHERE proxy = ?", (url,))
|
||||
return cursor.fetchall() != []
|
||||
|
||||
|
||||
|
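get_data_with_proxy walks the proxies table ordered by the last failure timestamp and returns the first JSON response it can get. A stand-alone sketch, using an in-memory database with the same proxies schema that cache.py creates and an example Wikipedia API URL:

import sqlite3

from proxy import fetch_proxies, get_data_with_proxy

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE proxies(proxy TEXT, lasttime_could_not_be_used DECIMAL)")

fetch_proxies(db)   # scrape proxynova.com and fill the proxies table

# Tries proxies in order of their last failure time; raises once all of them have failed.
data = get_data_with_proxy(
    "https://en.wikipedia.org/w/api.php?action=query&prop=links&titles=Angela_Merkel&format=json",
    db,
)
print(list(data["query"]["pages"].keys()))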
exam/ex01/receive.py (new file, 49 lines)
@@ -0,0 +1,49 @@
import logging
from url import construct_url
from proxy import get_data_with_proxy

logger = logging.getLogger(__name__)

def _receive_links(title, connection):
    url = construct_url(title)

    result = get_data_with_proxy(url, connection)
    # This is basically because we don't know the page ID.
    for k, page_data in result["query"]["pages"].items():
        cursor = connection.cursor()
        if(not "links" in page_data):
            destination_title = page_data["title"].replace(" ", "_")
            # avoid 1-loops
            if(destination_title == title):
                continue
            cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
            yield destination_title

        else:
            for destination in page_data["links"]:
                destination_title = destination["title"].replace(" ", "_")
                cursor.execute("INSERT INTO links(source, destination) VALUES(?, ?)", (title, destination_title))
                yield destination_title
    connection.commit()

def receive_links(title, connection):
    return list(_receive_links(title, connection))


def receive_link_graph(title, connection, depth):
    if(depth < 0):
        # end of recursion
        return

    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(source) FROM links WHERE source=?", (title,))
    if(cursor.fetchone()[0] != 0):
        # we fetched that title already
        return

    logger.info("fetching links for {}".format(title))

    for link in _receive_links(title, connection):
        receive_link_graph(link, connection, depth - 1)
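_receive_links walks a MediaWiki query response whose page ID key is not known in advance, which is why it iterates over result["query"]["pages"].items(). An abridged, illustrative shape of such a response (page ID and titles are made up):

result = {
    "query": {
        "pages": {
            "2130": {                       # page ID, not known beforehand
                "pageid": 2130,
                "title": "Angela Merkel",
                "links": [                  # absent for pages without outgoing links
                    {"ns": 0, "title": "Chancellor of Germany"},
                    {"ns": 0, "title": "Christian Democratic Union"},
                ],
            }
        }
    }
}

Each title is normalized with replace(" ", "_") before it is stored in the links table.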
exam/ex01/url.py (new file, 3 lines)
@@ -0,0 +1,3 @@

def construct_url(title):
    return "https://en.wikipedia.org/w/api.php?action=query&prop=links&pllimit=500&titles={}&format=json".format(title)
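For example, the URL built for the start page used in main.py:

from url import construct_url

print(construct_url("Angela_Merkel"))
# https://en.wikipedia.org/w/api.php?action=query&prop=links&pllimit=500&titles=Angela_Merkel&format=json

pllimit=500 caps the number of links returned per request; continuation for pages with more links is not handled in this commit.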