# scientific-programming-exercises — exam/ex01/proxy.py

"""
Module to fetch new proxies from
https://www.proxynova.com/proxy-server-list/country-de/
"""
import requests
import logging
import time
from bs4 import BeautifulSoup
from cfg import config
import sql
logger = logging.getLogger(__name__)
class NoMoreProxiesException(Exception):
    """Raised when every proxy in the database has been tried and failed."""
def get_data_with_proxy(url, conn_object, visit_first=None):
    """
    Fetch ``url`` through the proxies stored in the database and return
    the response body parsed as JSON.

    Proxies are tried in the order produced by the ``get_proxies``
    statement.  A proxy that fails (connection error, empty body, or an
    HTTP error status) gets its "last time it could not be used"
    timestamp updated and the next proxy is tried.

    :param url: the URL to fetch.
    :param conn_object: open DB connection whose schema matches
        ``sql.statements`` (a ``proxies`` table is assumed).
    :param visit_first: unused; kept for interface compatibility.
    :returns: the decoded JSON payload of the first successful response.
    :raises NoMoreProxiesException: if every stored proxy failed.
    """
    cursor = conn_object.cursor()
    update_cursor = conn_object.cursor()
    # Assume that table name is proxies
    cursor.execute(sql.statements["get_proxies"])
    headers = {}
    for proxy_url, lasttime_could_not_be_used in cursor:
        session = requests.Session()
        session.proxies = {'http': proxy_url}
        try:
            response = session.get(url, headers=headers, timeout=1)
        except requests.exceptions.RequestException:
            # BUG FIX: the original caught the broad ``Exception`` and then
            # tested for KeyboardInterrupt — dead code, since
            # KeyboardInterrupt derives from BaseException and is never
            # caught by ``except Exception``.  Catching RequestException
            # handles exactly the proxy failures and lets everything else
            # propagate naturally.
            # Proxy is invalid/inactive: record the failure and go on.
            update_cursor.execute(sql.statements["update_proxies"],
                                  (time.time(), proxy_url))
            continue
        # Empty body or HTTP error status: treat the proxy as bad as well.
        if not response.text or 399 < response.status_code < 600:
            update_cursor.execute(sql.statements["update_proxies"],
                                  (time.time(), proxy_url))
            continue
        # Be nice to Wikipedia.
        time.sleep(0.1)
        return response.json()
    raise NoMoreProxiesException("No more proxies left")
def fetch_proxies(connection):
    """
    Fetch new proxies from
    https://www.proxynova.com/proxy-server-list/country-de/ and insert
    the previously unknown ones into the database ``connection``.

    :param connection: open DB connection whose schema matches
        ``sql.statements``.
    :raises Exception: if the proxy-list page does not answer with 200.
    """
    resp = requests.request(
        url="https://www.proxynova.com/proxy-server-list/country-de/",
        method="GET")
    logger.info("request status code: {}; elapsed us: {}".format(
        resp.status_code, resp.elapsed.microseconds))
    if resp.status_code != 200:
        logger.error("status code is not 200")
        raise Exception("failed to retrieve proxy list")
    soup = BeautifulSoup(resp.text, "html.parser")
    cursor = connection.cursor()
    cnt = 0
    for ip_addr, port in _get_rows(soup):
        url = "http://{}:{}".format(ip_addr, port)
        if not proxy_is_in_db(url, connection):
            cursor.execute(sql.statements["insert_proxy"], (url,))
            cnt += 1
    # BUG FIX: use the module-level ``logger`` (as everywhere else in this
    # file), not the root logger via ``logging.info``, so the message
    # honours this module's logging configuration.
    logger.info("added {} new proxies".format(cnt))
    connection.commit()
def _get_rows(soup):
    """
    Yield ``(ip_addr, port)`` tuples parsed from the proxy-list table in
    ``soup``.

    The first table row is a header and is skipped; rows that cannot be
    parsed (e.g. too few columns) are silently skipped as well.

    :param soup: ``BeautifulSoup`` document of the proxy-list page.
    """
    # Count what we actually yield, not the raw row index: the original
    # logged the loop variable, which (a) raised NameError when the table
    # had no rows and (b) over-counted rows that failed to parse.
    count = 0
    for i, row in enumerate(soup.findAll("tr")):
        # first row is a header
        if i == 0:
            continue
        try:
            columns = row.findAll("td")
            ip_addr, port = [col.get_text() for col in columns[0:2]]
            port = port.strip()
            # Well they thought that they kinda obfuscated that.
            # Unfortunately their obfuscation is crap and so this bypasses
            # it.
            ip_addr = ip_addr[25:30] + ip_addr[45:-5]
            count += 1
            yield ip_addr, port
        except Exception:
            # Malformed row — skip it and keep going.
            continue
    logger.info("retrieved {} proxies".format(count))
def proxy_is_in_db(url, connection):
    """
    Return ``True`` if the proxy ``url`` is already stored in the
    database behind ``connection``, ``False`` otherwise.
    """
    db_cursor = connection.cursor()
    db_cursor.execute(sql.statements["proxy_in_db"], (url,))
    matches = db_cursor.fetchall()
    return len(matches) > 0