From ba99e54ea7a5fbad1000d68983e6d27cb833f30c Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Fri, 2 Sep 2022 14:18:06 +0200 Subject: [PATCH] webkit is blocked --- requirements.txt | 1 + src/mail/mail_reader.py | 2 +- src/workers/GeoCaptchSolver.py | 75 ++++++++++++++++++++++++++ src/workers/commandor_page.py | 99 +++++++++++++++------------------- 4 files changed, 121 insertions(+), 56 deletions(-) create mode 100644 src/workers/GeoCaptchSolver.py diff --git a/requirements.txt b/requirements.txt index 6b508cc..872d413 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ firebase_admin==5.2.0 pandas~=1.3.5 playwright==1.25.0 dataclasses~=0.6 +SpeechRecognition==3.8.1 pymongo==4.1.1 anticaptchaofficial==1.0.43 oci~=2.54.1 diff --git a/src/mail/mail_reader.py b/src/mail/mail_reader.py index 3f40c22..5e56979 100644 --- a/src/mail/mail_reader.py +++ b/src/mail/mail_reader.py @@ -163,7 +163,7 @@ def read_mails(): if need_to_valid_url(url, successful_items): url_validator = LinkValidator(url) print("need to validate url: " + url) - executor.submit(url_validator.start_page, params.get_proxy(ProxyType.BRIGHT_DATA), True) + executor.submit(url_validator.start_page, params.get_proxy(ProxyType.BRIGHT_DATA), False) else: print("do not need to click url --> {}".format(mail.mail_address)) diff --git a/src/workers/GeoCaptchSolver.py b/src/workers/GeoCaptchSolver.py new file mode 100644 index 0000000..920dc3d --- /dev/null +++ b/src/workers/GeoCaptchSolver.py @@ -0,0 +1,75 @@ +import re + +import requests +import speech_recognition as sr +import wget + +WAV_FILE_REGEX = "https:[\/a-z0-9.-]+.wav" + +number_text_dict = {'eight': 8, 'zero': 0, 'one': 1, 'to': 2, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, + 'seven': 7, 'nine': 9} + + +class GeoCaptchaSolver: + + def __init__(self, page): + self.page = page + + def solve(self): + print("solve() called.") + print("try to find iframe") + iframe = self.page.query_selector('iframe').content_frame() + self.iframe = iframe + print(type(iframe)) + print("url is " + iframe.url) + r = requests.get(iframe.url) + body = r.text + # print(body) + match = re.search(WAV_FILE_REGEX, body) + if match: + wav_url = match.group(0) + print("wav file " + wav_url) + response = requests.get(wav_url) + file_name = wav_url.split("/")[-1] + wget.download(wav_url, file_name) + open(file_name, "wb").write(response.content) + r = sr.Recognizer() + with sr.AudioFile(file_name) as source: + audio = r.record(source) # read the entire audio file + # recognize speech using Sphinx + try: + text = r.recognize_sphinx(audio) + number_list = text.split(" ")[-6:] + print(number_list) + number_to_type = [] + for number in number_list: + number_to_type.append(number_text_dict[number]) + print(number_to_type) + if len(number_to_type) == 6: + self.input_numbers(number_to_type) + print("Sphinx thinks you said " + text) + except sr.UnknownValueError: + print("Sphinx could not understand audio") + except sr.RequestError as e: + print("Sphinx error; {0}".format(e)) + else: + print("wav file not found") + + def input_numbers(self, number_to_type): + # use javascript to input + + self.input_by_js(number_to_type) + # input_elements = self.page.query_selector('.audio-captcha-inputs') + # print(input_elements) + + # download file + + def input_by_js(self, number_to_type): + index = 0 + for number in number_to_type: + index = index + 1 + (self.iframe.page).evaluate("""(info) =>{ + let input_fileds = document.getElementsByClassName("audio-captcha-inputs"); + console.log(input_fileds) + input_fileds[info.index].value = info.number;} + """, {'index': index, 'number': number}) diff --git a/src/workers/commandor_page.py b/src/workers/commandor_page.py index b8ba279..071437b 100644 --- a/src/workers/commandor_page.py +++ b/src/workers/commandor_page.py @@ -8,8 +8,6 @@ import time import traceback from typing import Union -from anticaptchaofficial.antigatetask import antigateTask -from playwright._impl._api_structures import SetCookieParam from playwright_stealth import stealth_sync from src import params, definitions @@ -18,6 +16,7 @@ from src.pojo.ModeEnum import ModeEnum from src.pojo.ReserveResultPojo import ReserveResultPojo, PublishType from src.pojo.contact_pojo import ContactPojo from src.proxy.proxy_type import ProxyType +from src.workers.GeoCaptchSolver import GeoCaptchaSolver from src.workers.SolveCaptch import SolveCaptcha from src.workers.TlsPlaywright import TlsPlaywright @@ -30,6 +29,7 @@ REGEX_RDV_URL = "https:\/\/rendezvousparis\.hermes\.com\/client\/register\/[A-Z0 otp_value = None OTP_FIELD_ID = "#sms_code" MESSAGE_FIELD_CLASS = ".message" +audio_button = "captcha__audio__button" BLANK_URL = "about:blank" CONFIRMED_MESSAGE = "Your request for a Leather Goods appointment has been registered" CONFIRMED_MESSAGE_FR = "Votre demande de rendez-vous Maroquinerie a bien été enregistrée et nous vous en remercions." @@ -124,40 +124,17 @@ class CommandorPage: def start_browser(self, proxy, pwright, device) -> Union[str, None]: try: - self.browser = pwright.webkit.launch(headless=self.headless, timeout=PAGE_TIMEOUT, proxy=proxy) + self.browser = pwright.webkit.launch(headless=self.headless, timeout=PAGE_TIMEOUT) + # self.browser = pwright.webkit.launch(headless=self.headless, timeout=PAGE_TIMEOUT, proxy=proxy) # userAgent = random.choice(params.firefox_user_agent_list) simulated_mobile = pwright.devices[device] userAgent = simulated_mobile['user_agent'] print("user_agent is " + userAgent) # context = self.browser.new_context(**simulated_mobile, locale='fr-FR') - context = self.browser.new_context(**simulated_mobile, locale='fr-FR') + context = self.browser.new_context(**simulated_mobile) self.current_context = context self.create_and_config_page(context) return self.page.content() - # self.browser = pwright.webkit.launch(headless=self.headless, timeout=PAGE_TIMEOUT, proxy=proxy) - # self.logger.info("模拟设备: " + device) - # simulated_mobile = pwright.devices[device] - # context = self.browser.new_context(**simulated_mobile, locale='fr-FR') - # self.page = context.new_page() - # # hide webdriver information - # self.page.add_init_script("""() => { - # Object.defineProperty(navigator,'webdriver',{get: () => undefined}); - # Object.defineProperty(navigator, 'platform', { - # get: () => { - # return "iPhone"; - # }}); - # } - # """) - # self.page.on("load", self._on_page_loaded) - # self.page.on("response", self.handle_response) - # self.page.goto(RDV_URL, timeout=PAGE_TIMEOUT) - # captcha_url = "geo.captcha-delivery.com/captcha" - # if captcha_url in self.page.content(): - # self.logger.info("will close browser") - # self.browser.close() - # return None - # else: - # return self.page.content() except Exception as error: params.oracle_log_sender.send_error(str(error)) traceback.print_exc(*sys.exc_info()) @@ -194,31 +171,36 @@ class CommandorPage: def solve_datadome_captcha(self): print("solve_datadome_captcha") - solver = antigateTask() - solver.set_verbose(1) - solver.set_key("ede6a69396fc961af351e7c8ffda9059") - solver.set_website_url(RDV_URL) - solver.set_template_name("Anti-bot screen bypass") - solver.set_variables({ - "css_selector": ".captcha__human__container" - }) - result = solver.solve_and_return_solution() - if result != 0: - cookies, localStorage, fingerprint, url, domain = result["cookies"], result["localStorage"], result[ - "fingerprint"], result["url"], result["domain"] - print("cookies: ", cookies) - print("localStorage: ", localStorage) - print("fingerprint: ", fingerprint) - print("url: " + url) - print("domain: " + domain) - # add cookies to playwright - cookie_list = [] - cookie_list.append(SetCookieParam(name='datadome', value=cookies['datadome'], url=url)) - self.page.context.add_cookies(cookie_list) - self.config_page_with_fingerprint(fingerprint) - self.page.reload() - else: - print("task finished with error " + solver.error_code) + solver = GeoCaptchaSolver(self.page) + # time.sleep(2) + # self._click_audio_btn() + # time.sleep(1) + solver.solve() + # solver = antigateTask() + # solver.set_verbose(1) + # solver.set_key("ede6a69396fc961af351e7c8ffda9059") + # solver.set_website_url(RDV_URL) + # solver.set_template_name("Anti-bot screen bypass") + # solver.set_variables({ + # "css_selector": ".captcha__human__container" + # }) + # result = solver.solve_and_return_solution() + # if result != 0: + # cookies, localStorage, fingerprint, url, domain = result["cookies"], result["localStorage"], result[ + # "fingerprint"], result["url"], result["domain"] + # print("cookies: ", cookies) + # print("localStorage: ", localStorage) + # print("fingerprint: ", fingerprint) + # print("url: " + url) + # print("domain: " + domain) + # # add cookies to playwright + # cookie_list = [] + # cookie_list.append(SetCookieParam(name='datadome', value=cookies['datadome'], url=url)) + # self.page.context.add_cookies(cookie_list) + # self.config_page_with_fingerprint(fingerprint) + # self.page.reload() + # else: + # print("task finished with error " + solver.error_code) def _on_page_loaded(self): # time.sleep(40000) @@ -236,8 +218,8 @@ class CommandorPage: self.get_errors() except Exception as error: self.logger.error(error) - # else: - # self.solve_datadome_captcha() + else: + self.solve_datadome_captcha() def on_document_loaded(self): self.logger.info("on_document_loaded called") @@ -280,6 +262,13 @@ class CommandorPage: except Exception as error: self.logger.error(error) + def _click_audio_btn(self): + time.sleep(get_random_wait_time()) + self.page.evaluate("""{ + let surname = document.getElementById('captcha__audio__button'); + surname.click();} + """) + def get_errors(self): # send error result if self.page.url != BLANK_URL: