"""
|
||
|
Module to fetch new proxies from
|
||
|
https://www.proxynova.com/proxy-server-list/country-de/
|
||
|
"""
|
||
|
|
||
|
import requests
|
||
|
import logging
|
||
|
import time
|
||
|
from bs4 import BeautifulSoup
|
||
|
|
||
|
logger = logging.getLogger(__name__)
|
||
|
|
||
|


def get_data_with_proxy(url, conn_object, visit_first=None):
    """
    Fetch ``url`` through the least recently failed proxy in the database
    and return the response body parsed as JSON. ``visit_first`` is
    currently unused.
    """
    cursor = conn_object.cursor()
    # Assume that the table name is proxies; try the proxy whose last
    # failure is oldest first.
    cursor.execute('''SELECT proxy, lasttime_could_not_be_used FROM proxies
                      ORDER BY lasttime_could_not_be_used ASC''')
    # Materialize the result first: running the UPDATEs below on the same
    # cursor while iterating over it would cut the iteration short.
    rows = cursor.fetchall()
    headers = {}
    for proxy, lasttime_could_not_be_used in rows:
        session = requests.Session()
        session.proxies = {'http': proxy}
        try:
            response = session.get(url, headers=headers, timeout=3)
        except requests.exceptions.RequestException:
            # The proxy is invalid/inactive: record the failure time and
            # move on to the next proxy.
            cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ?
                              WHERE proxy = ?''', (time.time(), proxy))
            conn_object.commit()
            continue
        # Empty body or an error status: record the failure time and move
        # on to the next proxy.
        if not response.text or 399 < response.status_code < 600:
            cursor.execute('''UPDATE proxies SET lasttime_could_not_be_used = ?
                              WHERE proxy = ?''', (time.time(), proxy))
            conn_object.commit()
            continue
        return response.json()
    raise Exception("No more proxies left")
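
# A minimal usage sketch, assuming a sqlite3 database. The schema below is
# an assumption inferred from this module's queries ("INSERT INTO proxies
# VALUES(?, 0)" expects exactly these two columns); the real table may be
# created elsewhere with different types.
#
#     import sqlite3
#
#     connection = sqlite3.connect("proxies.db")
#     connection.execute('''CREATE TABLE IF NOT EXISTS proxies
#                           (proxy TEXT PRIMARY KEY,
#                            lasttime_could_not_be_used REAL)''')
#     fetch_proxies(connection)
#     data = get_data_with_proxy("http://httpbin.org/ip", connection)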


def fetch_proxies(connection):
    """
    Fetch new proxies from
    https://www.proxynova.com/proxy-server-list/country-de/ and put them
    in the database behind ``connection``.
    """
    resp = requests.get("https://www.proxynova.com/proxy-server-list/country-de/")
    logger.info("request status code: {}; elapsed us: {}".format(
        resp.status_code, resp.elapsed.microseconds))
    if resp.status_code != 200:
        logger.error("status code is not 200")
        raise Exception("failed to retrieve proxy list")
    soup = BeautifulSoup(resp.text, "html.parser")

    cursor = connection.cursor()
    added = 0
    for ip_addr, port in _get_rows(soup):
        url = "http://{}:{}".format(ip_addr, port)
        if not proxy_is_in_db(url, connection):
            cursor.execute("INSERT INTO proxies VALUES(?, 0)", (url,))
            added += 1
    logger.info("added {} new proxies".format(added))
    connection.commit()


def _get_rows(soup):
    count = 0
    for i, row in enumerate(soup.find_all("tr")):
        # The first row is a header.
        if i == 0:
            continue
        try:
            columns = row.find_all("td")
            ip_addr, port = [col.get_text() for col in columns[0:2]]
            port = port.strip()
            # The site obfuscates the IP address inside the cell text, but
            # the layout is fixed, so two hard-coded slices recover the two
            # fragments of the real address.
            ip_addr = ip_addr[25:30] + ip_addr[45:-5]
            count += 1
            yield ip_addr, port
        except Exception:
            break
    logger.info("retrieved {} proxies".format(count))
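
# A hypothetical illustration of the fixed-slice trick above; the real
# offsets depend on proxynova's markup, so the padding here is made up:
#
#     >>> s = "x" * 25 + "192.1" + "y" * 15 + "68.0.1" + "z" * 5
#     >>> s[25:30] + s[45:-5]
#     '192.168.0.1'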


def proxy_is_in_db(url, connection):
    """Return True if ``url`` is already stored in the proxies table."""
    cursor = connection.cursor()
    cursor.execute("SELECT proxy FROM proxies WHERE proxy = ?", (url,))
    return cursor.fetchone() is not None