100 lines
2.9 KiB
Python
100 lines
2.9 KiB
Python
"""
|
|
Module to fetch new proxies from
|
|
https://www.proxynova.com/proxy-server-list/country-de/
|
|
"""
|
|
|
|
import requests
|
|
import logging
|
|
import time
|
|
from bs4 import BeautifulSoup
|
|
|
|
from cfg import config
|
|
|
|
# Module-level logger; its name is this module's ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
class NoMoreProxiesException(Exception):
    """Signals that every proxy known to the database has been tried."""
|
|
|
|
def _mark_proxy_failed(cursor, proxy):
    """Store the current time as the moment ``proxy`` last could not be used."""
    cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = %s WHERE proxy = %s ''',
                   (time.time(), proxy))


def get_data_with_proxy(url, conn_object, visit_first=None):
    """
    GET ``url`` through the stored proxies and return the decoded JSON body.

    Proxies are read from the ``proxies`` table ordered by
    ``lasttime_could_not_be_used`` ascending and tried one after another.
    A proxy whose request raises, returns an empty body, or answers with a
    4xx/5xx status gets its ``lasttime_could_not_be_used`` set to the
    current time and the next proxy is tried.

    :param url: address to request.
    :param conn_object: DB-API connection that owns the ``proxies`` table.
        This function never commits; persisting the failure timestamps is
        left to the caller.
    :param visit_first: accepted for interface compatibility; not used here.
    :returns: the result of ``response.json()`` for the first usable proxy.
    :raises NoMoreProxiesException: when no stored proxy produced a usable
        response.

    Whatever ``response.json()`` raises for a non-JSON body propagates to
    the caller.
    """
    cursor = conn_object.cursor()
    # Assume that table name is proxies
    cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies ORDER BY lasttime_could_not_be_used ASC''')
    # Snapshot the candidates before looping: the failure UPDATE below is
    # executed on this same cursor, and executing a new statement discards
    # the result set that a live iteration over the cursor would still need.
    candidates = cursor.fetchall()
    headers = {}
    for proxy, _last_failure in candidates:
        # The context manager closes the session's pooled connections even
        # when we move on to the next proxy.
        with requests.Session() as session:
            # NOTE(review): only the 'http' scheme is mapped; requests picks
            # proxies by URL scheme, so an https:// ``url`` would presumably
            # bypass the proxy -- confirm this is intended.
            session.proxies = {'http': proxy}
            try:
                response = session.get(url, headers=headers, timeout=3)
            except Exception:
                # KeyboardInterrupt derives from BaseException, so it is not
                # caught here and still propagates.
                # If proxy is invalid/inactive, update lasttime could not be
                # used and go next proxy
                _mark_proxy_failed(cursor, proxy)
                continue
            # If text is empty, update lasttime could not be used and go next proxy
            if not response.text or 399 < response.status_code < 600:
                _mark_proxy_failed(cursor, proxy)
                continue
            # Be nice to Wikipedia.
            time.sleep(0.3)
            return response.json()
    raise NoMoreProxiesException("No more proxies left")
|
|
|
|
|
|
def fetch_proxies(connection, timeout=30):
    """
    Fetch new proxies from
    https://www.proxynova.com/proxy-server-list/country-de/ and put them
    in the database ``connection``.

    Every row produced by ``_get_rows`` is turned into an
    ``http://<ip>:<port>`` URL; URLs that ``proxy_is_in_db`` does not find
    are inserted into ``proxies`` with a second column value of ``0``.
    The connection is committed once at the end.

    :param connection: DB-API connection that owns the ``proxies`` table.
    :param timeout: value handed to ``requests`` as the request timeout, so
        an unresponsive server cannot block the call indefinitely.
    :raises Exception: when the proxy list page does not answer with
        status 200.
    """
    resp = requests.request(url="https://www.proxynova.com/proxy-server-list/country-de/",
                            method="GET", timeout=timeout)
    # ``timedelta.microseconds`` is only the sub-second component; build the
    # full duration in microseconds from all three timedelta fields.
    elapsed = resp.elapsed
    elapsed_us = (elapsed.days * 86400 + elapsed.seconds) * 10 ** 6 + elapsed.microseconds
    logger.info("request status code: %s; elapsed us: %s", resp.status_code, elapsed_us)
    if resp.status_code != 200:
        logger.error("status code is not 200")
        raise Exception("failed to retrieve proxy list")
    soup = BeautifulSoup(resp.text, "html.parser")

    cursor = connection.cursor()
    cnt = 0
    for ip_addr, port in _get_rows(soup):
        url = "http://{}:{}".format(ip_addr, port)
        if not proxy_is_in_db(url, connection):
            cursor.execute("INSERT INTO proxies VALUES(%s, 0)", (url,))
            cnt += 1
    # Use the module logger (not the root logger) like the rest of the file.
    logger.info("added %s new proxies", cnt)
    connection.commit()
|
|
|
|
|
|
|
|
|
|
def _get_rows(soup):
|
|
for i, row in enumerate(soup.findAll("tr")):
|
|
# first row is a header
|
|
if(i == 0):
|
|
continue
|
|
try:
|
|
columns = row.findAll("td")
|
|
ip_addr, port = [i.get_text() for i in columns[0:2]]
|
|
port = port.strip()
|
|
# Well they thought that they kinda obfuscated that.
|
|
# Unfortunately their obfuscation is crap and so this bypasses
|
|
# it.
|
|
ip_addr = ip_addr[25:30] + ip_addr[45:-5]
|
|
yield ip_addr, port
|
|
except Exception as e:
|
|
continue
|
|
logger.info("retrieved {} proxies".format(i))
|
|
|
|
|
|
def proxy_is_in_db(url, connection):
    """
    Tell whether ``url`` is already stored in the ``proxies`` table.

    :param url: proxy URL to look up in the ``proxy`` column.
    :param connection: DB-API connection used to run the lookup.
    :returns: ``True`` when the query returns at least one row, else ``False``.
    """
    lookup = connection.cursor()
    lookup.execute("SELECT proxy FROM proxies WHERE proxy = %s", (url,))
    matches = lookup.fetchall()
    return True if matches else False
|
|
|
|
|
|
|