new method to extract names

This commit is contained in:
2023-05-11 00:23:51 +02:00
parent 2ba53ea677
commit 2e45143c61
5 changed files with 131 additions and 66 deletions
+7 -3
View File
@@ -5,6 +5,7 @@ from imapclient import IMAPClient
from src.db.mongo_manager import MONGO_STORE_MANAGER from src.db.mongo_manager import MONGO_STORE_MANAGER
from src.mail.mail_constants import create_imap from src.mail.mail_constants import create_imap
from src.pojo.mail.mail_pojo import MailAddress from src.pojo.mail.mail_pojo import MailAddress
from src.utils.excel_reader import ExcelHelper
class MailAddressValidator(): class MailAddressValidator():
@@ -49,8 +50,7 @@ def remove_invalid_email():
MONGO_STORE_MANAGER.remove_email_from_destination_email_list(mail) MONGO_STORE_MANAGER.remove_email_from_destination_email_list(mail)
def find_and_update_invalid_emails(): def find_and_update_invalid_emails(mail_list):
mail_list = MONGO_STORE_MANAGER.get_destination_emails()
# mail_address1 = MailAddress(mail="perrateke1983@onet.pl", password="8EQh#UuyMx8zVO9") # mail_address1 = MailAddress(mail="perrateke1983@onet.pl", password="8EQh#UuyMx8zVO9")
# # mail_address2 = MailAddress(mail="chenpeijun@aol.com", password="ytifuwguknzifqyb") # # mail_address2 = MailAddress(mail="chenpeijun@aol.com", password="ytifuwguknzifqyb")
# # mail_address2 = MailAddress(mail="sdfgfhgf1986@aol.com", password="fjwcgvhxxlywqfwm") # # mail_address2 = MailAddress(mail="sdfgfhgf1986@aol.com", password="fjwcgvhxxlywqfwm")
@@ -67,4 +67,8 @@ def find_and_update_invalid_emails():
if __name__ == '__main__': if __name__ == '__main__':
# remove_invalid_email() # remove_invalid_email()
find_and_update_invalid_emails() mail_list = MONGO_STORE_MANAGER.get_destination_emails()
# excel_reader = ExcelHelper()
# mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Downloads/hotmail_list.xlsx")
# print(email_list)
find_and_update_invalid_emails(mail_list)
+6 -3
View File
@@ -9,11 +9,12 @@ from imapclient import IMAPClient
from src.db.mirgration.migration_tools import migre_accepted_appointment from src.db.mirgration.migration_tools import migre_accepted_appointment
from src.db.mongo_manager import MONGO_STORE_MANAGER from src.db.mongo_manager import MONGO_STORE_MANAGER
from src.mail.mail_constants import DOMAIN_HOTMAIL, create_imap, show_folders from src.mail.mail_constants import create_imap, show_folders
from src.notification.AcceptedResultPojo import get_accepted_result_from from src.notification.AcceptedResultPojo import get_accepted_result_from
from src.notification.mailer import Mailer from src.notification.mailer import Mailer
from src.pojo.ResultEnum import ResultEnum from src.pojo.ResultEnum import ResultEnum
from src.pojo.mail.mail_pojo import MailPojo, MailAddress from src.pojo.mail.mail_pojo import MailPojo
from src.utils.excel_reader import ExcelHelper
CONFIRMATION_SUBJECT_FR = 'Votre rendez-vous est' CONFIRMATION_SUBJECT_FR = 'Votre rendez-vous est'
CONFIRMATION_SUBJECT_EN = 'appointment is confirmed' CONFIRMATION_SUBJECT_EN = 'appointment is confirmed'
@@ -155,13 +156,15 @@ def accept_appointment_found(accepted_result_list: list):
def read_mails_and_find_confirmation_contacts(): def read_mails_and_find_confirmation_contacts():
mail_list = MONGO_STORE_MANAGER.get_destination_emails() mail_list = MONGO_STORE_MANAGER.get_destination_emails()
# excel_reader = ExcelHelper()
# mail_list =excel_reader.read_email_pojo(file_name="/Users/lpan/Desktop/hotmail_list.xlsx")
# mail_address3 = MailAddress(mail="taibenchragu1978@onet.pl", password="2J)kyfNgyOZ") # mail_address3 = MailAddress(mail="taibenchragu1978@onet.pl", password="2J)kyfNgyOZ")
# mail_list = [mail_address3] # mail_list = [mail_address3]
mails_messages = [] mails_messages = []
# read all the emails # read all the emails
with ThreadPoolExecutor(max_workers=200) as executor: with ThreadPoolExecutor(max_workers=200) as executor:
for mail in mail_list: for mail in mail_list:
if DOMAIN_HOTMAIL not in mail.mail: # if DOMAIN_HOTMAIL not in mail.mail:
mail_reader = MailConfirmationReader(mail.mail, mail.password) mail_reader = MailConfirmationReader(mail.mail, mail.password)
executor.submit(mail_reader.read_emails, mails_messages) executor.submit(mail_reader.read_emails, mails_messages)
accepted_appointment_list = [] accepted_appointment_list = []
+8 -4
View File
@@ -13,7 +13,9 @@ from src.db.mongo_manager import MONGO_STORE_MANAGER
from src.logs.AppLogging import init_logger from src.logs.AppLogging import init_logger
from src.mail.mail_constants import DOMAIN_HOTMAIL, create_imap from src.mail.mail_constants import DOMAIN_HOTMAIL, create_imap
from src.pojo.mail.mail_pojo import MailPojo, MailAddress from src.pojo.mail.mail_pojo import MailPojo, MailAddress
from src.utils.excel_reader import ExcelHelper
from src.utils.timeutiles import is_time_between from src.utils.timeutiles import is_time_between
from src.workers.link_validator import LinkValidator
VALIDATION_URL_SUBJECT_fr = 'Validation de votre demande de rendez-vous' VALIDATION_URL_SUBJECT_fr = 'Validation de votre demande de rendez-vous'
VALIDATION_URL_SUBJECT_EN = 'Please confirm your appointment request' VALIDATION_URL_SUBJECT_EN = 'Please confirm your appointment request'
@@ -87,9 +89,9 @@ class MailReader():
folder=folder)) folder=folder))
else: else:
mail_list.extend(self._get_messages_from_folder_for_imapclient(imap)) mail_list.extend(self._get_messages_from_folder_for_imapclient(imap))
if DOMAIN_HOTMAIL in self.login: # if DOMAIN_HOTMAIL in self.login:
mail_list.extend( # mail_list.extend(
self._get_messages_from_folder_for_imapclient(imap, folder="Junk")) # self._get_messages_from_folder_for_imapclient(imap, folder="Junk"))
if not isImapClient: if not isImapClient:
imap.close() imap.close()
imap.logout() imap.logout()
@@ -245,8 +247,10 @@ def read_mails():
if is_time_between(time(7, 30), time(19, 30)): if is_time_between(time(7, 30), time(19, 30)):
# get email address # get email address
mail_list = MONGO_STORE_MANAGER.get_destination_emails() mail_list = MONGO_STORE_MANAGER.get_destination_emails()
# excel_reader = ExcelHelper()
# mail_list =excel_reader.read_email_pojo(file_name="/Users/lpan/Desktop/hotmail_list.xlsx")
# mail_address1 = MailAddress(mail="appointment2022@aol.com", password="gyilpmvyyvlcaviq") # mail_address1 = MailAddress(mail="appointment2022@aol.com", password="gyilpmvyyvlcaviq")
# mail_address1 = MailAddress(mail="chenpeijun@aol.com", password="ytifuwguknzifqyb") # mail_address1 = MailAddress(mail="sayedyepesv@hotmail.com", password="JGc1UH41")
# # mail_address3 = MailAddress(mail="ciyuexie@aol.com", password="czezlmmyypokdfce") # # mail_address3 = MailAddress(mail="ciyuexie@aol.com", password="czezlmmyypokdfce")
# mail_list = [mail_address1] # mail_list = [mail_address1]
successful_items = MONGO_STORE_MANAGER.get_all_successful_items_for_day() successful_items = MONGO_STORE_MANAGER.get_all_successful_items_for_day()
+106 -49
View File
@@ -29,7 +29,28 @@ def get_better_list(list):
# 关键词提取 # 关键词提取
def read_pinyin_list_from_file() -> list: def read_pinyin_list_from_file() -> list:
file2 = open('clean_list.txt', 'r') file2 = open('all_new_name_list.txt', 'r')
lines = file2.readlines()
name_list = []
count = 0
for line in lines:
count += 1
print("Line{}: {}".format(count, line.strip()))
name_list.append(line.strip())
return name_list
def read_pinyin_first_name_from_file() -> list:
    """Load the candidate first names from ``first_name_noDuplicates.txt``.

    The file is opened relative to the current working directory.  Each
    line is stripped of surrounding whitespace, echoed to stdout together
    with its 1-based line number, and collected in file order.

    Returns:
        list: the stripped lines (blank lines yield empty strings).

    Raises:
        OSError: if the file cannot be opened.
    """
    name_list = []
    # The context manager closes the handle even when reading fails; the
    # previous bare open() left it open until garbage collection.
    with open('first_name_noDuplicates.txt', 'r') as name_file:
        for count, line in enumerate(name_file, start=1):
            stripped = line.strip()
            print("Line{}: {}".format(count, stripped))
            name_list.append(stripped)
    return name_list
def read_pinyin_last_name_from_file() -> list:
file2 = open('last_name_noDuplicates.txt', 'r')
lines = file2.readlines() lines = file2.readlines()
name_list = [] name_list = []
count = 0 count = 0
@@ -40,57 +61,88 @@ def read_pinyin_list_from_file() -> list:
return name_list return name_list
def generate_new_list_from_old_name_list():
    """Write every last-name/first-name pairing to ``all_new_name_list.txt``.

    Last names are read with ``read_pinyin_last_name_from_file()`` and first
    names with ``read_pinyin_first_name_from_file()``.  The output file, in
    the current working directory, is overwritten and receives one
    ``"<last_name> <first_name>"`` line per combination, with the last name
    varying slowest.

    Raises:
        OSError: if the output file cannot be opened or written.
    """
    all_last_name = read_pinyin_last_name_from_file()
    all_first_name = read_pinyin_first_name_from_file()
    # ``with`` closes (and flushes) the output even if a write raises; the
    # manual close() it replaces was skipped on any error.
    with open("all_new_name_list.txt", "w") as out_file:
        for last_name in all_last_name:
            out_file.writelines(
                "{} {}\n".format(last_name, first_name)
                for first_name in all_first_name
            )
def has_numbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
def check_name(word_to_test, pinyin_name_list):
    """Find the first known name that contains ``word_to_test``.

    The comparison is case-insensitive.  A fragment matches an entry when it
    is a substring of the last name, the first name, or either concatenation
    of the two (last+first or first+last).

    Args:
        word_to_test: candidate fragment to look up.
        pinyin_name_list: iterable of space-separated name strings; the
            first token is taken as the last name and the final token as
            the first name.

    Returns:
        A ``(last_name, first_name)`` tuple for the first matching entry, or
        ``None`` when the fragment is empty, contains ``_``, ``.`` or a
        digit, or matches nothing.
    """
    # An empty fragment is a substring of every string, so without this
    # guard it would "match" whatever entry happens to come first.
    if not word_to_test:
        return None
    if "_" in word_to_test or "." in word_to_test:
        return None
    if has_numbers(word_to_test):
        return None
    # Lower-case the fragment once instead of on every comparison.
    needle = word_to_test.lower()
    for name in pinyin_name_list:
        parts = name.split(" ")
        last_name = parts[0]
        first_name = parts[-1]
        full_name = (last_name + first_name).lower()
        full_name_inverse = (first_name + last_name).lower()
        # A hit inside the last or first name alone is necessarily a hit
        # inside their concatenation, so these two tests cover all four
        # cases the original checked separately.
        if needle in full_name or needle in full_name_inverse:
            return last_name, first_name
    return None
def generate_name_from_email(mail_address, pinyin_name_list): def generate_name_from_email(mail_address, pinyin_name_list):
# key_words = HanLP.extractKeyword(mail_address, 2) # key_words = HanLP.extractKeyword(mail_address, 2)
# print(key_words) # print(key_words)
# setence = "".join(key_words) # setence = "".join(key_words)
print("generate for " + mail_address) print("generate for " + mail_address)
all_combins = get_ordered_combins(mail_address) all_combins = get_ordered_combins(mail_address)
all_combins.sort(key=len, reverse=True)
print(all_combins)
for i in all_combins: for i in all_combins:
word_to_test = "".join(i) word_to_test = "".join(i)
print("word to test is " + word_to_test) print("word to test is " + word_to_test)
if len(word_to_test) >= 6: # if len(word_to_test) >= 11:
for name in pinyin_name_list: # found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
last_name = name.split(" ")[0] # if found_name is not None:
first_name = name.split(" ")[-1] # return found_name
full_name = last_name + first_name # if len(word_to_test) >= 10:
full_name_inverse = first_name + last_name # found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
if word_to_test in last_name: # if found_name is not None:
return last_name, first_name # return found_name
elif word_to_test in first_name: # if len(word_to_test) >= 9:
return last_name, first_name # found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
elif word_to_test in full_name: # if found_name is not None:
return last_name, first_name # return found_name
elif word_to_test in full_name_inverse: # if len(word_to_test) >= 8:
return last_name, first_name # found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
if len(word_to_test) >= 5: # if found_name is not None:
for name in pinyin_name_list: # return found_name
last_name = name.split(" ")[0] # if len(word_to_test) >= 7:
first_name = name.split(" ")[-1] # found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
full_name = last_name + first_name # if found_name is not None:
full_name_inverse = first_name + last_name # return found_name
if word_to_test in last_name: # if len(word_to_test) >= 6:
return last_name, first_name # found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
elif word_to_test in first_name: # if found_name is not None:
return last_name, first_name # return found_name
elif word_to_test in full_name: # if len(word_to_test) >= 5:
return last_name, first_name # found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
elif word_to_test in full_name_inverse: # if found_name is not None:
return last_name, first_name # return found_name
if len(word_to_test) >= 4: if len(word_to_test) >= 4:
for name in pinyin_name_list: found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
last_name = name.split(" ")[0] if found_name is not None:
first_name = name.split(" ")[-1] return found_name
full_name = last_name + first_name
full_name_inverse = first_name + last_name
if word_to_test in last_name:
return last_name, first_name
elif word_to_test in first_name:
return last_name, first_name
elif word_to_test in full_name:
return last_name, first_name
elif word_to_test in full_name_inverse:
return last_name, first_name
# 选择不重复的 # 选择不重复的
# if len(pinyin_name_list) > 3: # if len(pinyin_name_list) > 3:
@@ -124,7 +176,7 @@ def write_new_contacts_to_excel(valid_contacts: list):
workbook.close() workbook.close()
def find_contact(generate_contacts: list, mail): def find_contact(generate_contacts: list, mail, pinyin_name_list):
contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="") contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="")
spliteed = mail.mail.split("@") spliteed = mail.mail.split("@")
possible_name_list = generate_name_from_email(spliteed[0], pinyin_name_list) possible_name_list = generate_name_from_email(spliteed[0], pinyin_name_list)
@@ -135,19 +187,24 @@ def find_contact(generate_contacts: list, mail):
generate_contacts.append(contact) generate_contacts.append(contact)
if __name__ == '__main__': def generate_contact_from_mail_list():
db_manager = MongoDbManager() db_manager = MongoDbManager()
# mail_list = db_manager.get_destination_emails()[6000:7000] # mail_list = db_manager.get_destination_emails()[6000:7000]
# mail_list = db_manager.get_destination_emails()[3001:3200] # mail_list = db_manager.get_destination_emails()[3001:3200]
# mail_list = db_manager.get_destination_emails()[50:200] mail_list = db_manager.get_destination_emails()[7570:7590]
excel_reader = ExcelHelper() # excel_reader = ExcelHelper()
mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/yahoo_list.xlsx") # mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/yahoo_list.xlsx")
generate_contacts = [] generate_contacts = []
pinyin_name_list = read_pinyin_list_from_file() pinyin_name_list = read_pinyin_list_from_file()
random.shuffle(pinyin_name_list) random.shuffle(pinyin_name_list)
with ThreadPoolExecutor(max_workers=200) as executor: with ThreadPoolExecutor(max_workers=500) as executor:
for mail in mail_list: for mail in mail_list:
executor.submit(find_contact, generate_contacts, mail) executor.submit(find_contact, generate_contacts, mail, pinyin_name_list)
# for mail in mail_list: # for mail in mail_list:
# find_contact(generate_contacts, mail) # find_contact(generate_contacts, mail)
write_new_contacts_to_excel(generate_contacts) write_new_contacts_to_excel(generate_contacts)
# Script entry point: when this module is run directly, build the contact
# list via generate_contact_from_mail_list().
if __name__ == '__main__':
    generate_contact_from_mail_list()
    # Disabled alternative entry point, kept for manual runs:
    # generate_new_list_from_old_name_list()
+2 -5
View File
@@ -24,10 +24,9 @@ DOUBLE_REQUEST_ERROR_MESSAGE_FR = "Une demande avec les données saisies a déj
class LinkValidator: class LinkValidator:
tls = TlsPlaywright() tls = TlsPlaywright()
def __init__(self, link: str, proxy_type=ProxyType.RESIDENTIAL, headless=False): def __init__(self, link: str, headless=False):
self.is_finished = False self.is_finished = False
self.link = link self.link = link
self.proxy_type = proxy_type
self.is_event_sent = False self.is_event_sent = False
self.is_captcha_in_error = False self.is_captcha_in_error = False
self.is_filling_fields = False self.is_filling_fields = False
@@ -50,10 +49,8 @@ class LinkValidator:
def _run(self, proxy): def _run(self, proxy):
self.logger.info("will start browser") self.logger.info("will start browser")
# reset otp_value to None # reset otp_value to None
devices = random.choice(params.DEVICES)
first_page = None
# while first_page is None: # while first_page is None:
self.start_browser(proxy, self.tls.playwright, devices) self.start_browser(self.tls.playwright)
# proxy = params.get_proxy(self.proxy_type) # proxy = params.get_proxy(self.proxy_type)
otp_input = self.page.locator(OTP_FIELD_ID) otp_input = self.page.locator(OTP_FIELD_ID)
otp_input.wait_for(state='visible', timeout=TIME_OUT) otp_input.wait_for(state='visible', timeout=TIME_OUT)