From 5d5d7afed541257dc8bf490b9f324f14cead3d05 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Sat, 6 Apr 2024 13:50:58 +0200 Subject: [PATCH] support invalid url --- db/mongo_manager.py | 7 ++++++- proxy_manager/proxy_manager.py | 6 ++++-- queue_message/appointmentrequestsender.py | 1 + workers/link_validator.py | 25 ++++++++++++++++++++--- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/db/mongo_manager.py b/db/mongo_manager.py index dc19c6e..37b1302 100755 --- a/db/mongo_manager.py +++ b/db/mongo_manager.py @@ -83,10 +83,13 @@ class MongoDbManager: self.logger.info(error) return link_list - def link_validated_for_result(self, link: str, linkPojo: LinkPojo, state=True, is_duplicated=False): + def link_validated_for_result(self, link: str, linkPojo: LinkPojo, state=True, is_duplicated=False, + is_invalid=False): print("link_validated_for_result() called with url = " + link) if is_duplicated: _id = link.split("/")[-2] + elif is_invalid: + _id = linkPojo.url.split("/")[-2] else: _id = link.split("/")[-1] print("link_validated_for_result() called with id = " + _id) @@ -97,6 +100,8 @@ class MongoDbManager: collection = self.db[collection_name] validated_at = time.strftime("%H:%M:%S", time.localtime()) validated_by = "requests" + if is_invalid: + validated_by = "Invalid" if is_duplicated: validated_by = "Double" collection.find_one_and_update({'_id': _id}, { diff --git a/proxy_manager/proxy_manager.py b/proxy_manager/proxy_manager.py index 2c514cf..b3fa51a 100644 --- a/proxy_manager/proxy_manager.py +++ b/proxy_manager/proxy_manager.py @@ -22,6 +22,8 @@ FR_PROXY_ASOCK_RES_2 = { class ProxyManager: def get_link_validate_proxy(self, links_to_validate: list) -> list: if len(links_to_validate) > 15: - return [FR_PROXY_RES_OXY, FR_PROXY_ASOCK_RES_2, FR_ASOCKS_MOBILE_PROXY] + # return [FR_PROXY_RES_OXY, FR_PROXY_ASOCK_RES_2, FR_ASOCKS_MOBILE_PROXY] + return [FR_PROXY_RES_OXY] else: - return [FR_PROXY_RES_OXY, FR_PROXY_ASOCK_RES_2, FR_ASOCKS_MOBILE_PROXY] + # return [FR_PROXY_RES_OXY, FR_PROXY_ASOCK_RES_2, FR_ASOCKS_MOBILE_PROXY] + return [FR_PROXY_RES_OXY] diff --git a/queue_message/appointmentrequestsender.py b/queue_message/appointmentrequestsender.py index 78ef91b..b905de3 100644 --- a/queue_message/appointmentrequestsender.py +++ b/queue_message/appointmentrequestsender.py @@ -18,6 +18,7 @@ from workers.sender import Sender QUEUE_HOST = "appointment.lpaconsulting.fr" REQUEST_DATA_QUEUE = 'REQUEST_DATA' +REQUEST_DATA_DE = 'REQUEST_DATA_DE' credentials = pika.PlainCredentials('appointment', 'ZyuhJZ2xEYWhElhpJjy7YEpZGZwNYJz2fHIu') diff --git a/workers/link_validator.py b/workers/link_validator.py index 7af0243..bf313dd 100644 --- a/workers/link_validator.py +++ b/workers/link_validator.py @@ -11,10 +11,11 @@ from models.LinkPojo import LinkPojo from models.result_pojo import RequestResult from proxy_manager.proxy_manager import ProxyManager from queue_message.CookiesPublisher import CookiesPublisher, REQUEST_DATA_QUEUE_TEST -from queue_message.appointmentrequestsender import QUEUE_HOST, REQUEST_DATA_QUEUE, credentials +from queue_message.appointmentrequestsender import QUEUE_HOST, REQUEST_DATA_QUEUE, credentials, REQUEST_DATA_DE from workers.proxies_constants import PROXY_LIST_FR DOUBLE_MESSAGE = "Une demande de rendez-vous a déjà été enregistrée avec ces coordonnées" +INVALID = "Depuis plus de 130 ans," class LinkValidator(threading.Thread): @@ -63,6 +64,7 @@ class LinkValidator(threading.Thread): _proxy_to_use = random.choice(self.proxy_manager.get_link_validate_proxy(self.link_to_validate_list)) print(_proxy_to_use) print("received cookie is " + str(self.cookie_str)) + print("send request for link: " + linkPojo.url) try: response = requests.get(url=linkPojo.url, headers=headers, verify=False, proxies=_proxy_to_use, timeout=30) @@ -85,6 +87,22 @@ class LinkValidator(threading.Thread): self.cookiesPublisher.publish_body(new_coolies_str) self.cookie_str = new_coolies_str return RequestResult.SUCCESS + elif INVALID in _content: + + MONGO_STORE_MANAGER.link_validated_for_result(response.url, linkPojo, is_invalid=True) + # set new cookies + _cookies_to_set = response.headers['set-cookie'] + self.cookie.load(_cookies_to_set) + new_cookies = {k: v.value for k, v in self.cookie.items()} + new_coolies_str = "" + for key in new_cookies: + new_coolies_str = new_coolies_str + key + "=" + new_cookies[key] + ";" + print("will publish to queue {}".format(new_coolies_str)) + # upload the cookie to queue + self.cookiesPublisher.publish_body(new_coolies_str) + self.cookie_str = new_coolies_str + return RequestResult.SUCCESS + elif DOUBLE_MESSAGE in _content: print(response.url) MONGO_STORE_MANAGER.link_validated_for_result(response.url, linkPojo, is_duplicated=True) @@ -171,14 +189,15 @@ class LinkValidator(threading.Thread): def validate_with_FR_ip(segment_position=1): - _queue_name = REQUEST_DATA_QUEUE + # _queue_name = REQUEST_DATA_QUEUE + _queue_name = REQUEST_DATA_DE cookiesPublisher = CookiesPublisher(queue_name=_queue_name) cookiesPublisher.set_up_connection() print("filter links with ip_country") _proxy_manager = ProxyManager() receiver = LinkValidator(cookiesPublisher=cookiesPublisher, proxy_manager=_proxy_manager, - queue_to_listen=_queue_name, ip_country="FR", segment_position=segment_position, limit=50) + queue_to_listen=_queue_name, ip_country="FR", segment_position=segment_position, limit=0) print("will connect to queue") receiver.set_up_connection() receiver.listen_to_queue(receiver.on_message)