support invalid url

This commit is contained in:
2024-04-06 13:50:58 +02:00
parent 88ff6d539b
commit 5d5d7afed5
4 changed files with 33 additions and 6 deletions
+6 -1
View File
@@ -83,10 +83,13 @@ class MongoDbManager:
self.logger.info(error) self.logger.info(error)
return link_list return link_list
def link_validated_for_result(self, link: str, linkPojo: LinkPojo, state=True, is_duplicated=False): def link_validated_for_result(self, link: str, linkPojo: LinkPojo, state=True, is_duplicated=False,
is_invalid=False):
print("link_validated_for_result() called with url = " + link) print("link_validated_for_result() called with url = " + link)
if is_duplicated: if is_duplicated:
_id = link.split("/")[-2] _id = link.split("/")[-2]
elif is_invalid:
_id = linkPojo.url.split("/")[-2]
else: else:
_id = link.split("/")[-1] _id = link.split("/")[-1]
print("link_validated_for_result() called with id = " + _id) print("link_validated_for_result() called with id = " + _id)
@@ -97,6 +100,8 @@ class MongoDbManager:
collection = self.db[collection_name] collection = self.db[collection_name]
validated_at = time.strftime("%H:%M:%S", time.localtime()) validated_at = time.strftime("%H:%M:%S", time.localtime())
validated_by = "requests" validated_by = "requests"
if is_invalid:
validated_by = "Invalid"
if is_duplicated: if is_duplicated:
validated_by = "Double" validated_by = "Double"
collection.find_one_and_update({'_id': _id}, { collection.find_one_and_update({'_id': _id}, {
+4 -2
View File
@@ -22,6 +22,8 @@ FR_PROXY_ASOCK_RES_2 = {
class ProxyManager: class ProxyManager:
def get_link_validate_proxy(self, links_to_validate: list) -> list: def get_link_validate_proxy(self, links_to_validate: list) -> list:
if len(links_to_validate) > 15: if len(links_to_validate) > 15:
return [FR_PROXY_RES_OXY, FR_PROXY_ASOCK_RES_2, FR_ASOCKS_MOBILE_PROXY] # return [FR_PROXY_RES_OXY, FR_PROXY_ASOCK_RES_2, FR_ASOCKS_MOBILE_PROXY]
return [FR_PROXY_RES_OXY]
else: else:
return [FR_PROXY_RES_OXY, FR_PROXY_ASOCK_RES_2, FR_ASOCKS_MOBILE_PROXY] # return [FR_PROXY_RES_OXY, FR_PROXY_ASOCK_RES_2, FR_ASOCKS_MOBILE_PROXY]
return [FR_PROXY_RES_OXY]
@@ -18,6 +18,7 @@ from workers.sender import Sender
QUEUE_HOST = "appointment.lpaconsulting.fr" QUEUE_HOST = "appointment.lpaconsulting.fr"
REQUEST_DATA_QUEUE = 'REQUEST_DATA' REQUEST_DATA_QUEUE = 'REQUEST_DATA'
REQUEST_DATA_DE = 'REQUEST_DATA_DE'
credentials = pika.PlainCredentials('appointment', 'ZyuhJZ2xEYWhElhpJjy7YEpZGZwNYJz2fHIu') credentials = pika.PlainCredentials('appointment', 'ZyuhJZ2xEYWhElhpJjy7YEpZGZwNYJz2fHIu')
+22 -3
View File
@@ -11,10 +11,11 @@ from models.LinkPojo import LinkPojo
from models.result_pojo import RequestResult from models.result_pojo import RequestResult
from proxy_manager.proxy_manager import ProxyManager from proxy_manager.proxy_manager import ProxyManager
from queue_message.CookiesPublisher import CookiesPublisher, REQUEST_DATA_QUEUE_TEST from queue_message.CookiesPublisher import CookiesPublisher, REQUEST_DATA_QUEUE_TEST
from queue_message.appointmentrequestsender import QUEUE_HOST, REQUEST_DATA_QUEUE, credentials from queue_message.appointmentrequestsender import QUEUE_HOST, REQUEST_DATA_QUEUE, credentials, REQUEST_DATA_DE
from workers.proxies_constants import PROXY_LIST_FR from workers.proxies_constants import PROXY_LIST_FR
DOUBLE_MESSAGE = "Une demande de rendez-vous a déjà été enregistrée avec ces coordonnées" DOUBLE_MESSAGE = "Une demande de rendez-vous a déjà été enregistrée avec ces coordonnées"
INVALID = "Depuis plus de 130 ans,"
class LinkValidator(threading.Thread): class LinkValidator(threading.Thread):
@@ -63,6 +64,7 @@ class LinkValidator(threading.Thread):
_proxy_to_use = random.choice(self.proxy_manager.get_link_validate_proxy(self.link_to_validate_list)) _proxy_to_use = random.choice(self.proxy_manager.get_link_validate_proxy(self.link_to_validate_list))
print(_proxy_to_use) print(_proxy_to_use)
print("received cookie is " + str(self.cookie_str)) print("received cookie is " + str(self.cookie_str))
print("send request for link: " + linkPojo.url)
try: try:
response = requests.get(url=linkPojo.url, headers=headers, verify=False, proxies=_proxy_to_use, response = requests.get(url=linkPojo.url, headers=headers, verify=False, proxies=_proxy_to_use,
timeout=30) timeout=30)
@@ -85,6 +87,22 @@ class LinkValidator(threading.Thread):
self.cookiesPublisher.publish_body(new_coolies_str) self.cookiesPublisher.publish_body(new_coolies_str)
self.cookie_str = new_coolies_str self.cookie_str = new_coolies_str
return RequestResult.SUCCESS return RequestResult.SUCCESS
elif INVALID in _content:
MONGO_STORE_MANAGER.link_validated_for_result(response.url, linkPojo, is_invalid=True)
# set new cookies
_cookies_to_set = response.headers['set-cookie']
self.cookie.load(_cookies_to_set)
new_cookies = {k: v.value for k, v in self.cookie.items()}
new_coolies_str = ""
for key in new_cookies:
new_coolies_str = new_coolies_str + key + "=" + new_cookies[key] + ";"
print("will publish to queue {}".format(new_coolies_str))
# upload the cookie to queue
self.cookiesPublisher.publish_body(new_coolies_str)
self.cookie_str = new_coolies_str
return RequestResult.SUCCESS
elif DOUBLE_MESSAGE in _content: elif DOUBLE_MESSAGE in _content:
print(response.url) print(response.url)
MONGO_STORE_MANAGER.link_validated_for_result(response.url, linkPojo, is_duplicated=True) MONGO_STORE_MANAGER.link_validated_for_result(response.url, linkPojo, is_duplicated=True)
@@ -171,14 +189,15 @@ class LinkValidator(threading.Thread):
def validate_with_FR_ip(segment_position=1): def validate_with_FR_ip(segment_position=1):
_queue_name = REQUEST_DATA_QUEUE # _queue_name = REQUEST_DATA_QUEUE
_queue_name = REQUEST_DATA_DE
cookiesPublisher = CookiesPublisher(queue_name=_queue_name) cookiesPublisher = CookiesPublisher(queue_name=_queue_name)
cookiesPublisher.set_up_connection() cookiesPublisher.set_up_connection()
print("filter links with ip_country") print("filter links with ip_country")
_proxy_manager = ProxyManager() _proxy_manager = ProxyManager()
receiver = LinkValidator(cookiesPublisher=cookiesPublisher, receiver = LinkValidator(cookiesPublisher=cookiesPublisher,
proxy_manager=_proxy_manager, proxy_manager=_proxy_manager,
queue_to_listen=_queue_name, ip_country="FR", segment_position=segment_position, limit=50) queue_to_listen=_queue_name, ip_country="FR", segment_position=segment_position, limit=0)
print("will connect to queue") print("will connect to queue")
receiver.set_up_connection() receiver.set_up_connection()
receiver.listen_to_queue(receiver.on_message) receiver.listen_to_queue(receiver.on_message)