"""Fetch HTTP proxies from proxynova.com and retrieve data through them.

The proxy list is scraped from
https://www.proxynova.com/proxy-server-list/country-de/ and stored in a
``proxies`` table with the columns ``proxy`` (URL string) and
``lasttime_could_not_be_used`` (UNIX timestamp of the last failure, 0 if the
proxy has never failed).
"""
import logging
import time

import requests
from bs4 import BeautifulSoup

from cfg import config  # noqa: F401  (unused in this module; kept as-is)

logger = logging.getLogger(__name__)

# Page that lists the proxies scraped by ``fetch_proxies``.
PROXY_LIST_URL = "https://www.proxynova.com/proxy-server-list/country-de/"


class NoMoreProxiesException(Exception):
    """Raised when every proxy in the database failed for a request."""


def _mark_proxy_failed(cursor, proxy):
    """Record the current time as the last failure time of ``proxy``."""
    cursor.execute(
        "UPDATE proxies SET lasttime_could_not_be_used = %s WHERE proxy = %s",
        (time.time(), proxy),
    )


def _request_via_proxy(url, proxy, timeout):
    """Return the response for ``url`` fetched via ``proxy``, or None.

    None is returned when the request raises, when the body is empty, or
    when the status code is in the 4xx/5xx range.
    """
    try:
        # The context manager closes the session's connection pool; the
        # body is already fully read at that point because the request is
        # not streamed.
        with requests.Session() as session:
            # NOTE(review): only the 'http' scheme is mapped, so requests
            # will not apply this proxy to https:// URLs -- confirm that
            # this is intended.
            session.proxies = {'http': proxy}
            response = session.get(url, headers={}, timeout=timeout)
    except Exception:
        # Deliberately broad: any failure simply disqualifies this proxy.
        # KeyboardInterrupt is not an Exception subclass and therefore
        # still propagates to the caller.
        logger.debug("request via proxy %s failed", proxy, exc_info=True)
        return None

    if not response.text or 400 <= response.status_code < 600:
        return None
    return response


def get_data_with_proxy(url, conn_object, visit_first=None, timeout=1):
    """Fetch ``url`` through the first working proxy and return its JSON.

    Proxies are tried in ascending order of ``lasttime_could_not_be_used``,
    i.e. the ones that failed longest ago (or never) come first. Every proxy
    that fails gets its ``lasttime_could_not_be_used`` set to the current
    time.

    NOTE(review): the UPDATE statements are not committed here; confirm that
    the caller commits or that the connection runs in autocommit mode.

    Args:
        url: URL to request.
        conn_object: DB-API connection to the database holding the
            ``proxies`` table.
        visit_first: Unused; kept so existing callers keep working.
        timeout: Timeout in seconds for each request attempt.

    Returns:
        The decoded JSON body of the first acceptable response.

    Raises:
        NoMoreProxiesException: If no proxy produced an acceptable response.
        Exception: Whatever ``response.json()`` raises when the body of an
            otherwise acceptable response is not valid JSON.
    """
    cursor = conn_object.cursor()
    # Assume that table name is proxies
    cursor.execute(
        "SELECT proxy, lasttime_could_not_be_used FROM proxies "
        "ORDER BY lasttime_could_not_be_used ASC"
    )
    # Read all candidates before looping: the UPDATEs below reuse this
    # cursor, and executing a new statement on a cursor discards the result
    # set it was still iterating over.
    candidates = cursor.fetchall()

    for proxy, _last_failed in candidates:
        response = _request_via_proxy(url, proxy, timeout)
        if response is None:
            _mark_proxy_failed(cursor, proxy)
            continue

        # Be nice to Wikipedia.
        time.sleep(0.1)
        return response.json()

    raise NoMoreProxiesException("No more proxies left")


def fetch_proxies(connection):
    """Scrape the proxynova.com list and store new proxies in the database.

    Each proxy that is not yet present in the ``proxies`` table is inserted
    with a ``lasttime_could_not_be_used`` of 0. The transaction is committed
    before returning.

    Args:
        connection: DB-API connection to the database holding the
            ``proxies`` table.

    Raises:
        Exception: If the proxy list page does not answer with status 200.
    """
    # A timeout keeps an unresponsive server from blocking forever.
    resp = requests.get(PROXY_LIST_URL, timeout=30)
    # ``elapsed.microseconds`` is only the sub-second component of the
    # timedelta, so derive the total from ``total_seconds()`` instead.
    logger.info(
        "request status code: %s; elapsed us: %d",
        resp.status_code,
        int(resp.elapsed.total_seconds() * 1000000),
    )
    if resp.status_code != 200:
        logger.error("status code is not 200")
        raise Exception("failed to retrieve proxy list")

    soup = BeautifulSoup(resp.text, "html.parser")
    cursor = connection.cursor()

    cnt = 0
    for ip_addr, port in _get_rows(soup):
        url = "http://{}:{}".format(ip_addr, port)
        if not proxy_is_in_db(url, connection):
            cursor.execute("INSERT INTO proxies VALUES(%s, 0)", (url,))
            cnt += 1

    logger.info("added %d new proxies", cnt)
    connection.commit()


def _get_rows(soup):
    """Yield ``(ip_addr, port)`` string pairs from the proxy table rows.

    The first ``<tr>`` is treated as the header and skipped, as are rows with
    fewer than two ``<td>`` cells and rows whose extracted address or port is
    empty.

    Args:
        soup: Parsed proxy list page.
    """
    count = 0
    for index, row in enumerate(soup.find_all("tr")):
        # first row is a header
        if index == 0:
            continue

        columns = row.find_all("td")
        if len(columns) < 2:
            continue

        raw_ip = columns[0].get_text()
        port = columns[1].get_text().strip()

        # Well they thought that they kinda obfuscated that.
        # Unfortunately their obfuscation is crap and so this bypasses
        # it. The offsets are tied to the page's current markup.
        ip_addr = raw_ip[25:30] + raw_ip[45:-5]
        if not ip_addr or not port:
            continue

        count += 1
        yield ip_addr, port

    # Count the pairs actually yielded rather than the last row index, which
    # was also undefined when the page contained no rows at all.
    logger.info("retrieved %d proxies", count)


def proxy_is_in_db(url, connection):
    """Return True if ``url`` is already stored in the ``proxies`` table."""
    cursor = connection.cursor()
    cursor.execute("SELECT proxy FROM proxies WHERE proxy = %s", (url,))
    # One row is enough to answer an existence check.
    return cursor.fetchone() is not None