scientific-programming-exer.../exam/ex01/proxy.py

100 lines
2.9 KiB
Python
Raw Permalink Normal View History

2019-02-02 10:18:57 +00:00
"""
Module to fetch new proxies from
https://www.proxynova.com/proxy-server-list/country-de/
"""
import requests
import logging
import time
from bs4 import BeautifulSoup
2019-02-15 10:47:50 +00:00
from cfg import config
2019-02-02 10:18:57 +00:00
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
2019-02-02 15:06:57 +00:00
class NoMoreProxiesException(Exception):
    """Raised when every known proxy has been tried without success."""
2019-02-02 10:18:57 +00:00
def get_data_with_proxy(url, conn_object, visit_first=None):
    """
    Fetch ``url`` through the first working proxy and return its JSON body.

    Proxies are read from the ``proxies`` table in ascending order of
    ``lasttime_could_not_be_used``. A proxy whose request raises, returns
    an empty body, or answers with a 4xx/5xx status gets its
    ``lasttime_could_not_be_used`` set to the current time and the next
    proxy is tried.

    :param url: URL to request.
    :param conn_object: database connection holding the ``proxies`` table.
        No commit is issued here.
    :param visit_first: unused; kept so existing callers keep working.
    :returns: ``response.json()`` of the first usable response.
    :raises NoMoreProxiesException: if no proxy yields a usable response.
    """
    cursor = conn_object.cursor()
    # Assume that table name is proxies
    cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
    # Materialize the candidates before looping: the UPDATE below reuses
    # this cursor, which would discard the still-pending SELECT results.
    candidates = cursor.fetchall()

    headers = {}
    for proxy, _lasttime_could_not_be_used in candidates:
        # The context manager closes the session's connections again.
        with requests.Session() as session:
            # NOTE(review): only the 'http' scheme is mapped, so https URLs
            # presumably do not go through the proxy -- confirm intent.
            session.proxies = {'http': proxy}
            try:
                response = session.get(url, headers=headers, timeout=3)
            except Exception:
                # KeyboardInterrupt is not an Exception subclass, so it
                # still propagates without an explicit re-raise.
                response = None

            usable = (
                response is not None
                and bool(response.text)
                and not 399 < response.status_code < 600
            )
            if usable:
                # Be nice to Wikipedia.
                time.sleep(0.1)
                return response.json()

        # Proxy is invalid/inactive or returned nothing useful: remember
        # when it failed and go on with the next proxy.
        cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %s WHERE proxy = %s ''',
                       (time.time(), proxy))

    raise NoMoreProxiesException("No more proxies left")
2019-02-02 10:18:57 +00:00
def fetch_proxies(connection):
    """
    Fetch new proxies from
    https://www.proxynova.com/proxy-server-list/country-de/ and put them
    in the database ``connection``.

    Every scraped address/port pair is stored as ``http://<ip>:<port>`` in
    the ``proxies`` table unless it is already present. The inserts are
    committed at the end.

    :param connection: database connection holding the ``proxies`` table.
    :raises Exception: if the proxy list page does not answer with
        status code 200.
    """
    resp = requests.request(url="https://www.proxynova.com/proxy-server-list/country-de/", method="GET")
    # ``timedelta.microseconds`` is only the sub-second component, so
    # derive the full duration from ``total_seconds()`` instead.
    elapsed_us = round(resp.elapsed.total_seconds() * 1000000)
    logger.info("request status code: {}; elapsed us: {}".format(resp.status_code,
                                                                 elapsed_us))
    if resp.status_code != 200:
        logger.error("status code is not 200")
        raise Exception("failed to retrieve proxy list")

    soup = BeautifulSoup(resp.text, "html.parser")
    cursor = connection.cursor()

    cnt = 0
    for ip_addr, port in _get_rows(soup):
        url = "http://{}:{}".format(ip_addr, port)
        if not proxy_is_in_db(url, connection):
            cursor.execute("INSERT INTO proxies VALUES(%s, 0)", (url,))
            cnt += 1

    # Use the module logger, like the rest of this file.
    logger.info("added {} new proxies".format(cnt))
    connection.commit()
def _get_rows(soup):
for i, row in enumerate(soup.findAll("tr")):
# first row is a header
if(i == 0):
continue
try:
columns = row.findAll("td")
ip_addr, port = [i.get_text() for i in columns[0:2]]
port = port.strip()
# Well they thought that they kinda obfuscated that.
# Unfortunately their obfuscation is crap and so this bypasses
# it.
ip_addr = ip_addr[25:30] + ip_addr[45:-5]
yield ip_addr, port
except Exception as e:
2019-02-02 15:44:51 +00:00
continue
2019-02-02 10:18:57 +00:00
logger.info("retrieved {} proxies".format(i))
def proxy_is_in_db(url, connection):
    """
    Tell whether ``url`` is already stored in the ``proxies`` table.

    :param url: proxy URL to look up.
    :param connection: database connection holding the ``proxies`` table.
    :returns: ``True`` if the lookup yields at least one row, else
        ``False``.
    """
    lookup = connection.cursor()
    lookup.execute("SELECT proxy FROM proxies WHERE proxy = %s", (url,))
    matches = lookup.fetchall()
    return True if matches else False
2019-02-02 10:18:57 +00:00