new method to extract names

This commit is contained in:
2023-05-11 00:23:51 +02:00
parent 2ba53ea677
commit 2e45143c61
5 changed files with 131 additions and 66 deletions
+7 -3
View File
@@ -5,6 +5,7 @@ from imapclient import IMAPClient
from src.db.mongo_manager import MONGO_STORE_MANAGER
from src.mail.mail_constants import create_imap
from src.pojo.mail.mail_pojo import MailAddress
from src.utils.excel_reader import ExcelHelper
class MailAddressValidator():
@@ -49,8 +50,7 @@ def remove_invalid_email():
MONGO_STORE_MANAGER.remove_email_from_destination_email_list(mail)
def find_and_update_invalid_emails():
mail_list = MONGO_STORE_MANAGER.get_destination_emails()
def find_and_update_invalid_emails(mail_list):
# mail_address1 = MailAddress(mail="perrateke1983@onet.pl", password="8EQh#UuyMx8zVO9")
# # mail_address2 = MailAddress(mail="chenpeijun@aol.com", password="ytifuwguknzifqyb")
# # mail_address2 = MailAddress(mail="sdfgfhgf1986@aol.com", password="fjwcgvhxxlywqfwm")
@@ -67,4 +67,8 @@ def find_and_update_invalid_emails():
if __name__ == '__main__':
# remove_invalid_email()
find_and_update_invalid_emails()
mail_list = MONGO_STORE_MANAGER.get_destination_emails()
# excel_reader = ExcelHelper()
# mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Downloads/hotmail_list.xlsx")
# print(email_list)
find_and_update_invalid_emails(mail_list)
+8 -5
View File
@@ -9,11 +9,12 @@ from imapclient import IMAPClient
from src.db.mirgration.migration_tools import migre_accepted_appointment
from src.db.mongo_manager import MONGO_STORE_MANAGER
from src.mail.mail_constants import DOMAIN_HOTMAIL, create_imap, show_folders
from src.mail.mail_constants import create_imap, show_folders
from src.notification.AcceptedResultPojo import get_accepted_result_from
from src.notification.mailer import Mailer
from src.pojo.ResultEnum import ResultEnum
from src.pojo.mail.mail_pojo import MailPojo, MailAddress
from src.pojo.mail.mail_pojo import MailPojo
from src.utils.excel_reader import ExcelHelper
CONFIRMATION_SUBJECT_FR = 'Votre rendez-vous est'
CONFIRMATION_SUBJECT_EN = 'appointment is confirmed'
@@ -155,15 +156,17 @@ def accept_appointment_found(accepted_result_list: list):
def read_mails_and_find_confirmation_contacts():
mail_list = MONGO_STORE_MANAGER.get_destination_emails()
# excel_reader = ExcelHelper()
# mail_list =excel_reader.read_email_pojo(file_name="/Users/lpan/Desktop/hotmail_list.xlsx")
# mail_address3 = MailAddress(mail="taibenchragu1978@onet.pl", password="2J)kyfNgyOZ")
# mail_list = [mail_address3]
mails_messages = []
# read all the emails
with ThreadPoolExecutor(max_workers=200) as executor:
for mail in mail_list:
if DOMAIN_HOTMAIL not in mail.mail:
mail_reader = MailConfirmationReader(mail.mail, mail.password)
executor.submit(mail_reader.read_emails, mails_messages)
# if DOMAIN_HOTMAIL not in mail.mail:
mail_reader = MailConfirmationReader(mail.mail, mail.password)
executor.submit(mail_reader.read_emails, mails_messages)
accepted_appointment_list = []
if len(mails_messages) > 0:
successful_items = MONGO_STORE_MANAGER.get_all_successful_items_for_day()
+8 -4
View File
@@ -13,7 +13,9 @@ from src.db.mongo_manager import MONGO_STORE_MANAGER
from src.logs.AppLogging import init_logger
from src.mail.mail_constants import DOMAIN_HOTMAIL, create_imap
from src.pojo.mail.mail_pojo import MailPojo, MailAddress
from src.utils.excel_reader import ExcelHelper
from src.utils.timeutiles import is_time_between
from src.workers.link_validator import LinkValidator
VALIDATION_URL_SUBJECT_fr = 'Validation de votre demande de rendez-vous'
VALIDATION_URL_SUBJECT_EN = 'Please confirm your appointment request'
@@ -87,9 +89,9 @@ class MailReader():
folder=folder))
else:
mail_list.extend(self._get_messages_from_folder_for_imapclient(imap))
if DOMAIN_HOTMAIL in self.login:
mail_list.extend(
self._get_messages_from_folder_for_imapclient(imap, folder="Junk"))
# if DOMAIN_HOTMAIL in self.login:
# mail_list.extend(
# self._get_messages_from_folder_for_imapclient(imap, folder="Junk"))
if not isImapClient:
imap.close()
imap.logout()
@@ -245,8 +247,10 @@ def read_mails():
if is_time_between(time(7, 30), time(19, 30)):
# get email address
mail_list = MONGO_STORE_MANAGER.get_destination_emails()
# excel_reader = ExcelHelper()
# mail_list =excel_reader.read_email_pojo(file_name="/Users/lpan/Desktop/hotmail_list.xlsx")
# mail_address1 = MailAddress(mail="appointment2022@aol.com", password="gyilpmvyyvlcaviq")
# mail_address1 = MailAddress(mail="chenpeijun@aol.com", password="ytifuwguknzifqyb")
# mail_address1 = MailAddress(mail="sayedyepesv@hotmail.com", password="JGc1UH41")
# # mail_address3 = MailAddress(mail="ciyuexie@aol.com", password="czezlmmyypokdfce")
# mail_list = [mail_address1]
successful_items = MONGO_STORE_MANAGER.get_all_successful_items_for_day()
+106 -49
View File
@@ -29,7 +29,28 @@ def get_better_list(list):
# 关键词提取
def read_pinyin_list_from_file() -> list:
    """Load the pinyin full-name list from ``all_new_name_list.txt``.

    The file is looked up relative to the current working directory. Each
    line is echoed to stdout with its 1-based line number, stripped of
    surrounding whitespace and collected in file order.

    Returns:
        list: one stripped string per line of the file.

    Raises:
        OSError: if the file cannot be opened (for example, it is missing).
    """
    name_list = []
    # Only 'all_new_name_list.txt' is read. A handle on 'clean_list.txt'
    # used to be opened first and was immediately overwritten without being
    # read or closed, so that dead open() has been removed.
    with open('all_new_name_list.txt', 'r') as name_file:
        for count, line in enumerate(name_file, start=1):
            stripped = line.strip()
            print("Line{}: {}".format(count, stripped))
            name_list.append(stripped)
    return name_list
def read_pinyin_first_name_from_file() -> list:
    """Load the first names from ``first_name_noDuplicates.txt``.

    The file is looked up relative to the current working directory. Each
    line is echoed to stdout with its 1-based line number, stripped of
    surrounding whitespace and collected in file order.

    Returns:
        list: one stripped string per line of the file.

    Raises:
        OSError: if the file cannot be opened (for example, it is missing).
    """
    name_list = []
    # The context manager closes the handle even if reading fails part-way.
    with open('first_name_noDuplicates.txt', 'r') as name_file:
        for count, line in enumerate(name_file, start=1):
            stripped = line.strip()
            print("Line{}: {}".format(count, stripped))
            name_list.append(stripped)
    return name_list
def read_pinyin_last_name_from_file() -> list:
file2 = open('last_name_noDuplicates.txt', 'r')
lines = file2.readlines()
name_list = []
count = 0
@@ -40,57 +61,88 @@ def read_pinyin_list_from_file() -> list:
return name_list
def generate_new_list_from_old_name_list():
    """Write every last-name/first-name pairing to ``all_new_name_list.txt``.

    Last names are taken from ``read_pinyin_last_name_from_file()`` and first
    names from ``read_pinyin_first_name_from_file()``. The output file (in the
    current working directory) is overwritten and receives one
    ``"<last_name> <first_name>"`` line per pairing, iterating last names in
    the outer loop and first names in the inner loop.

    Returns:
        None
    """
    all_last_name = read_pinyin_last_name_from_file()
    all_first_name = read_pinyin_first_name_from_file()
    # The context manager guarantees the file is flushed and closed even if
    # a write fails part-way through the (potentially very large) product.
    with open("all_new_name_list.txt", "w") as output_file:
        output_file.writelines(
            "{} {}\n".format(last_name, first_name)
            for last_name in all_last_name
            for first_name in all_first_name
        )
def has_numbers(inputString):
    """Tell whether ``inputString`` contains at least one digit character.

    A character counts as a digit when ``str.isdigit()`` is true for it.

    Returns:
        bool: True as soon as a digit is found, False otherwise (including
        for an empty string).
    """
    for symbol in inputString:
        if symbol.isdigit():
            return True
    return False
def check_name(word_to_test, pinyin_name_list):
    """Find the first known name that contains ``word_to_test``.

    Each entry of ``pinyin_name_list`` is split on single spaces; its first
    token is treated as the last name and its final token as the first name.
    The candidate matches an entry when, ignoring case, it is a substring of
    the last name, the first name, ``last + first`` or ``first + last``.

    Args:
        word_to_test: candidate fragment (e.g. part of an e-mail local part).
        pinyin_name_list: iterable of name strings to search, in priority
            order.

    Returns:
        tuple | None: ``(last_name, first_name)`` of the first matching
        entry, in the entry's original casing, or ``None`` when the candidate
        is empty, contains ``_``, ``.`` or a digit, or matches nothing.
    """
    # An empty candidate is a substring of every string, so without this
    # guard it would "match" whatever name happens to be first in the list.
    if not word_to_test:
        return None
    # Reject candidates containing separators or digits in a single pass.
    if any(ch.isdigit() or ch in "_." for ch in word_to_test):
        return None

    # Lower-case the candidate once instead of once per comparison.
    needle = word_to_test.lower()
    for name in pinyin_name_list:
        tokens = name.split(" ")
        last_name, first_name = tokens[0], tokens[-1]
        haystacks = (
            last_name.lower(),
            first_name.lower(),
            (last_name + first_name).lower(),
            (first_name + last_name).lower(),
        )
        if any(needle in haystack for haystack in haystacks):
            return last_name, first_name
    return None
def generate_name_from_email(mail_address, pinyin_name_list):
# key_words = HanLP.extractKeyword(mail_address, 2)
# print(key_words)
# setence = "".join(key_words)
print("generate for " + mail_address)
all_combins = get_ordered_combins(mail_address)
all_combins.sort(key=len, reverse=True)
print(all_combins)
for i in all_combins:
word_to_test = "".join(i)
print("word to test is " + word_to_test)
if len(word_to_test) >= 6:
for name in pinyin_name_list:
last_name = name.split(" ")[0]
first_name = name.split(" ")[-1]
full_name = last_name + first_name
full_name_inverse = first_name + last_name
if word_to_test in last_name:
return last_name, first_name
elif word_to_test in first_name:
return last_name, first_name
elif word_to_test in full_name:
return last_name, first_name
elif word_to_test in full_name_inverse:
return last_name, first_name
if len(word_to_test) >= 5:
for name in pinyin_name_list:
last_name = name.split(" ")[0]
first_name = name.split(" ")[-1]
full_name = last_name + first_name
full_name_inverse = first_name + last_name
if word_to_test in last_name:
return last_name, first_name
elif word_to_test in first_name:
return last_name, first_name
elif word_to_test in full_name:
return last_name, first_name
elif word_to_test in full_name_inverse:
return last_name, first_name
# if len(word_to_test) >= 11:
# found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
# if found_name is not None:
# return found_name
# if len(word_to_test) >= 10:
# found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
# if found_name is not None:
# return found_name
# if len(word_to_test) >= 9:
# found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
# if found_name is not None:
# return found_name
# if len(word_to_test) >= 8:
# found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
# if found_name is not None:
# return found_name
# if len(word_to_test) >= 7:
# found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
# if found_name is not None:
# return found_name
# if len(word_to_test) >= 6:
# found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
# if found_name is not None:
# return found_name
# if len(word_to_test) >= 5:
# found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
# if found_name is not None:
# return found_name
if len(word_to_test) >= 4:
for name in pinyin_name_list:
last_name = name.split(" ")[0]
first_name = name.split(" ")[-1]
full_name = last_name + first_name
full_name_inverse = first_name + last_name
if word_to_test in last_name:
return last_name, first_name
elif word_to_test in first_name:
return last_name, first_name
elif word_to_test in full_name:
return last_name, first_name
elif word_to_test in full_name_inverse:
return last_name, first_name
found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
if found_name is not None:
return found_name
# 选择不重复的
# if len(pinyin_name_list) > 3:
@@ -124,7 +176,7 @@ def write_new_contacts_to_excel(valid_contacts: list):
workbook.close()
def find_contact(generate_contacts: list, mail):
def find_contact(generate_contacts: list, mail, pinyin_name_list):
contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="")
spliteed = mail.mail.split("@")
possible_name_list = generate_name_from_email(spliteed[0], pinyin_name_list)
@@ -135,19 +187,24 @@ def find_contact(generate_contacts: list, mail):
generate_contacts.append(contact)
if __name__ == '__main__':
def generate_contact_from_mail_list():
db_manager = MongoDbManager()
# mail_list = db_manager.get_destination_emails()[6000:7000]
# mail_list = db_manager.get_destination_emails()[3001:3200]
# mail_list = db_manager.get_destination_emails()[50:200]
excel_reader = ExcelHelper()
mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/yahoo_list.xlsx")
mail_list = db_manager.get_destination_emails()[7570:7590]
# excel_reader = ExcelHelper()
# mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/yahoo_list.xlsx")
generate_contacts = []
pinyin_name_list = read_pinyin_list_from_file()
random.shuffle(pinyin_name_list)
with ThreadPoolExecutor(max_workers=200) as executor:
with ThreadPoolExecutor(max_workers=500) as executor:
for mail in mail_list:
executor.submit(find_contact, generate_contacts, mail)
executor.submit(find_contact, generate_contacts, mail, pinyin_name_list)
# for mail in mail_list:
# find_contact(generate_contacts, mail)
write_new_contacts_to_excel(generate_contacts)
if __name__ == '__main__':
generate_contact_from_mail_list()
# generate_new_list_from_old_name_list()
+2 -5
View File
@@ -24,10 +24,9 @@ DOUBLE_REQUEST_ERROR_MESSAGE_FR = "Une demande avec les données saisies a déj
class LinkValidator:
tls = TlsPlaywright()
def __init__(self, link: str, proxy_type=ProxyType.RESIDENTIAL, headless=False):
def __init__(self, link: str, headless=False):
self.is_finished = False
self.link = link
self.proxy_type = proxy_type
self.is_event_sent = False
self.is_captcha_in_error = False
self.is_filling_fields = False
@@ -50,10 +49,8 @@ class LinkValidator:
def _run(self, proxy):
self.logger.info("will start browser")
# reset otp_value to None
devices = random.choice(params.DEVICES)
first_page = None
# while first_page is None:
self.start_browser(proxy, self.tls.playwright, devices)
self.start_browser(self.tls.playwright)
# proxy = params.get_proxy(self.proxy_type)
otp_input = self.page.locator(OTP_FIELD_ID)
otp_input.wait_for(state='visible', timeout=TIME_OUT)