I have a scraping script that works fine when I run it from the console. However, when I run it as a scheduled task, I get a proxy connection error. I can't figure out why there would be a difference between the two, since I'm using the same virtualenv in both cases.
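In case it matters, the task is scheduled with cron and calls the script with the virtualenv's interpreter directly. The entry is along these lines (the schedule and log path here are illustrative, not my exact crontab):

2 6 * * * /home/rune/env/bin/python /home/rune/Danamica/Scraping/ScrapeStamdataUpdate.py >> /home/rune/scrape.log 2>&1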
Here's the error from my log:
2017-03-29 06:02 - ERROR - HTTPConnectionPool(host='nl.proxymesh.com', port=31280): Max retries exceeded with url: http://www.***.dk/emner/038-6740 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7fdede3f0c88>: Failed to establish a new connection: [Errno 111] Connection refused',)))
Traceback (most recent call last):
  File "/home/rune/env/lib/python3.5/site-packages/requests/packages/urllib3/connection.py", line 141, in _new_conn
    (self.host, self.port), self.timeout, **extra_kw)
  File "/home/rune/env/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py", line 83, in create_connection
    raise err
  File "/home/rune/env/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/rune/env/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "/home/rune/env/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py", line 356, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "/usr/lib/python3.5/http/client.py", line 1106, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python3.5/http/client.py", line 1151, in _send_request
    self.endheaders(body)
  File "/usr/lib/python3.5/http/client.py", line 1102, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
    self.send(msg)
  File "/usr/lib/python3.5/http/client.py", line 877, in send
    self.connect()
  File "/home/rune/env/lib/python3.5/site-packages/requests/packages/urllib3/connection.py", line 166, in connect
    conn = self._new_conn()
  File "/home/rune/env/lib/python3.5/site-packages/requests/packages/urllib3/connection.py", line 150, in _new_conn
    self, "Failed to establish a new connection: %s" % e)
requests.packages.urllib3.exceptions.NewConnectionError: <requests.packages.urllib3.connection.HTTPConnection object at 0x7fdede3f0c88>: Failed to establish a new connection: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/rune/env/lib/python3.5/site-packages/requests/adapters.py", line 423, in send
    timeout=timeout
  File "/home/rune/env/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py", line 649, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "/home/rune/env/lib/python3.5/site-packages/requests/packages/urllib3/util/retry.py", line 376, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
requests.packages.urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='nl.proxymesh.com', port=31280): Max retries exceeded with url: http://www.***.dk/emner/038-6740 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7fdede3f0c88>: Failed to establish a new connection: [Errno 111] Connection refused',)))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/rune/Danamica/Scraping/ScrapeStamdataUpdate.py", line 105, in get_html
    r = requests.get(url, headers=headers, proxies=proxies)
  File "/home/rune/env/lib/python3.5/site-packages/requests/api.py", line 70, in get
    return request('get', url, params=params, **kwargs)
  File "/home/rune/env/lib/python3.5/site-packages/requests/api.py", line 56, in request
    return session.request(method=method, url=url, **kwargs)
  File "/home/rune/env/lib/python3.5/site-packages/requests/sessions.py", line 488, in request
    resp = self.send(prep, **send_kwargs)
  File "/home/rune/env/lib/python3.5/site-packages/requests/sessions.py", line 609, in send
    r = adapter.send(request, **kwargs)
  File "/home/rune/env/lib/python3.5/site-packages/requests/adapters.py", line 485, in send
    raise ProxyError(e, request=request)
requests.exceptions.ProxyError: HTTPConnectionPool(host='nl.proxymesh.com', port=31280): Max retries exceeded with url: http://www.***.dk/emner/038-6740 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7fdede3f0c88>: Failed to establish a new connection: [Errno 111] Connection refused',)))
And here's the function in my script where the error happens:
# Imports used by this method (declared at the top of the module):
import random
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

def get_html(self, url, retries=5, sleep_retries=10, sleep_time=900):
    try:
        # Use a random user agent and a random proxy for each request.
        ua = UserAgent()
        headers = {"User-Agent": ua.random}
        proxy_list = [{'http': 'http://email@gmail.com:***@fr.proxymesh.com:31280',
                       'https': 'http://email@gmail.com:***@fr.proxymesh.com:31280'},
                      {'http': 'http://email@gmail.com:***@de.proxymesh.com:31280',
                       'https': 'http://email@gmail.com:***@de.proxymesh.com:31280'},
                      {'http': 'http://email@gmail.com:***@nl.proxymesh.com:31280',
                       'https': 'http://email@gmail.com:***@nl.proxymesh.com:31280'},
                      {'http': 'http://email@gmail.com:***@uk.proxymesh.com:31280',
                       'https': 'http://email@gmail.com:***@uk.proxymesh.com:31280'},
                      {'http': 'http://email@gmail.com:***@ch.proxymesh.com:31280',
                       'https': 'http://email@gmail.com:***@ch.proxymesh.com:31280'}]
        proxies = random.choice(proxy_list)
        r = requests.get(url, headers=headers, proxies=proxies)
        bsObj = BeautifulSoup(r.content, 'lxml')
        return bsObj
    except Exception as exc:  # On failure, retry immediately, then fall back to sleeping.
        logger.error(exc, exc_info=True)
        if retries > 0:
            logger.info("Retrying " + datetime.now().strftime(self.date_time_format))
            logger.info("Retries left: " + str(retries))
            # Recurse with one fewer immediate retry.
            return self.get_html(url, retries - 1, sleep_retries, sleep_time)
        if retries == 0 and sleep_retries > 0:
            logger.info("sleeping")
            time.sleep(sleep_time)
            logger.info("retrying after sleep " + datetime.now().strftime(self.date_time_format))
            logger.info("sleep retries left: " + str(sleep_retries))
            # Recurse with one fewer sleep-retry; no immediate retries remain.
            return self.get_html(url, 0, sleep_retries - 1, sleep_time)
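A stripped-down check like the one below, run both from the console and from the scheduled task, should isolate whether the proxy connection itself behaves differently in the two environments. It's a minimal sketch: the proxy URL is a placeholder in the same shape as the ones above, and httpbin.org just echoes the IP the request arrived from.

# minimal_proxy_check.py -- placeholder credentials/host, same format as above
import requests

proxies = {'http': 'http://user:pass@nl.proxymesh.com:31280',
           'https': 'http://user:pass@nl.proxymesh.com:31280'}

try:
    # httpbin.org/ip returns the requesting IP, so a success also confirms
    # the request actually went through the proxy.
    r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=30)
    print('OK', r.status_code, r.text)
except requests.exceptions.ProxyError as exc:
    print('Proxy failure:', exc)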