From 48bd46f08e98be1db978f4a3947ad02938a223d2 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Sat, 15 Apr 2023 22:35:22 +0200 Subject: [PATCH 01/10] try to extract name from email address --- src/utils/extract_name.py | 100 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 src/utils/extract_name.py diff --git a/src/utils/extract_name.py b/src/utils/extract_name.py new file mode 100644 index 0000000..b04bde5 --- /dev/null +++ b/src/utils/extract_name.py @@ -0,0 +1,100 @@ +import itertools + +import xlsxwriter +from pyhanlp import * +from Pinyin2Hanzi import DefaultHmmParams +from Pinyin2Hanzi import viterbi +from itertools import combinations + +from src.db.mongo_manager import MongoDbManager +from src.pojo.contact_pojo import ContactPojo +from src.utils.excel_reader import get_random_phone_numbers, generate_email_from_name +from src.utils.generate_random_passport_id import get_random_passport_id_number + + +def get_ordered_combins(stuff): + list_to_return = [] + for i, j in itertools.combinations(range(len(stuff) + 1), 2): + print(stuff[i:j]) + list_to_return.append(stuff[i:j]) + return list_to_return + + +def get_better_list(list): + for name in list: + if len(name) == 2: + list.remove(name) + return list + + +# 关键词提取 + + +def generate_name_from_email(mail_address): + key_words = HanLP.extractKeyword(mail_address, 2) + print(key_words) + hmmparams = DefaultHmmParams() + possible_name_list = [] + setence = "".join(key_words) + all_combins = get_ordered_combins(setence) + for i in all_combins: + word_to_test = "".join(i) + if len(word_to_test) >= 2: + # print("word to test is " + word_to_test) + try: + result = viterbi(hmm_params=hmmparams, observations=(word_to_test,), path_num=2) + for item in result: + print(item.score, item.path) + # print("word is " + word_to_test) + if len(word_to_test) >= 3: + all_combins.remove(i) + possible_name_list.append(word_to_test) + except Exception as error: + print(error) + + print(possible_name_list) + # 选择不重复的 + if len(possible_name_list) > 3: + return get_better_list(possible_name_list) + else: + return possible_name_list + + +def write_new_contacts_to_excel(valid_contacts: list): + row = 0 + col = 0 + # Create a workbook and add a worksheet. + workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts))) + header_data = ['name', 'phone', 'passport', 'email'] + worksheet = workbook.add_worksheet() + header_format = workbook.add_format({'bold': True}) + + for col_num, data in enumerate(header_data): + worksheet.write(row, col_num, data, header_format) + row = row + 1 + for info in valid_contacts: + info.phone = get_random_phone_numbers() + info.passport = get_random_passport_id_number() + # Iterate over the data and write it out row by row. + worksheet.write(row, col, "{} {}".format(info.last_name, info.first_name)) + worksheet.write(row, col + 1, info.phone) + worksheet.write(row, col + 2, info.passport) + worksheet.write(row, col + 3, info.mail) + row += 1 + workbook.close() + + +if __name__ == '__main__': + db_manager = MongoDbManager() + mail_list = db_manager.get_destination_emails()[0:50] + generate_contacts = [] + for mail in mail_list: + contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="") + spliteed = mail.mail.split("@") + possible_name_list = generate_name_from_email(spliteed[0]) + if len(possible_name_list) >= 2: + contact.last_name = possible_name_list[0] + contact.first_name = "".join(possible_name_list[1:-1]) + if len(contact.last_name) > 0 and len(contact.first_name) > 0: + generate_contacts.append(contact) + write_new_contacts_to_excel(generate_contacts) From 86fdb4b62c01f6dd9bff4be91b46d6336e8308ec Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Mon, 17 Apr 2023 14:04:27 +0200 Subject: [PATCH 02/10] extract name for 50-200 --- src/utils/extract_name.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/extract_name.py b/src/utils/extract_name.py index b04bde5..ea47ef8 100644 --- a/src/utils/extract_name.py +++ b/src/utils/extract_name.py @@ -86,7 +86,7 @@ def write_new_contacts_to_excel(valid_contacts: list): if __name__ == '__main__': db_manager = MongoDbManager() - mail_list = db_manager.get_destination_emails()[0:50] + mail_list = db_manager.get_destination_emails()[50:200] generate_contacts = [] for mail in mail_list: contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="") From 7bbdf6ca9cf46622abd2f58f9366dc2196762860 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Tue, 18 Apr 2023 20:23:14 +0200 Subject: [PATCH 03/10] add note in the excel --- src/pojo/contact_pojo.py | 2 ++ src/utils/extract_name.py | 39 +++++++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/pojo/contact_pojo.py b/src/pojo/contact_pojo.py index 3e1fe02..30cb475 100644 --- a/src/pojo/contact_pojo.py +++ b/src/pojo/contact_pojo.py @@ -12,6 +12,7 @@ class ContactPojo: mail: str ccid: str position: int + note: str def __init__(self, phone_number: str, passport_number: str, last_name: str, first_name: str, mail: str, ccid: str = "", @@ -23,6 +24,7 @@ class ContactPojo: self.ccid = ccid self.mail = mail self.position = position + self.note = "" def to_firestore_dict(self): dest = { diff --git a/src/utils/extract_name.py b/src/utils/extract_name.py index ea47ef8..94b3fe2 100644 --- a/src/utils/extract_name.py +++ b/src/utils/extract_name.py @@ -21,10 +21,11 @@ def get_ordered_combins(stuff): def get_better_list(list): - for name in list: - if len(name) == 2: - list.remove(name) - return list + return list + # for name in list: + # if len(name) == 2: + # list.remove(name) + # return list # 关键词提取 @@ -34,7 +35,8 @@ def generate_name_from_email(mail_address): key_words = HanLP.extractKeyword(mail_address, 2) print(key_words) hmmparams = DefaultHmmParams() - possible_name_list = [] + pinyin_name_list = [] + chinese_name_list = [] setence = "".join(key_words) all_combins = get_ordered_combins(setence) for i in all_combins: @@ -45,19 +47,20 @@ def generate_name_from_email(mail_address): result = viterbi(hmm_params=hmmparams, observations=(word_to_test,), path_num=2) for item in result: print(item.score, item.path) + chinese_name_list.extend(item.path) # print("word is " + word_to_test) - if len(word_to_test) >= 3: - all_combins.remove(i) - possible_name_list.append(word_to_test) + # if len(word_to_test) >= 3: + # all_combins.remove(i) + pinyin_name_list.append(word_to_test) except Exception as error: print(error) - print(possible_name_list) + print(pinyin_name_list) # 选择不重复的 - if len(possible_name_list) > 3: - return get_better_list(possible_name_list) - else: - return possible_name_list + # if len(pinyin_name_list) > 3: + # return get_better_list(pinyin_name_list) + # else: + return pinyin_name_list, chinese_name_list def write_new_contacts_to_excel(valid_contacts: list): @@ -65,7 +68,7 @@ def write_new_contacts_to_excel(valid_contacts: list): col = 0 # Create a workbook and add a worksheet. workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts))) - header_data = ['name', 'phone', 'passport', 'email'] + header_data = ['name', 'phone', 'passport', 'email', 'note'] worksheet = workbook.add_worksheet() header_format = workbook.add_format({'bold': True}) @@ -80,21 +83,25 @@ def write_new_contacts_to_excel(valid_contacts: list): worksheet.write(row, col + 1, info.phone) worksheet.write(row, col + 2, info.passport) worksheet.write(row, col + 3, info.mail) + worksheet.write(row, col + 4, info.note) row += 1 workbook.close() if __name__ == '__main__': db_manager = MongoDbManager() - mail_list = db_manager.get_destination_emails()[50:200] + mail_list = db_manager.get_destination_emails()[301:500] + # mail_list = db_manager.get_destination_emails()[50:200] generate_contacts = [] for mail in mail_list: contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="") spliteed = mail.mail.split("@") - possible_name_list = generate_name_from_email(spliteed[0]) + possible_name_list = generate_name_from_email(spliteed[0])[0] + chinese_name_list = generate_name_from_email(spliteed[0])[1] if len(possible_name_list) >= 2: contact.last_name = possible_name_list[0] contact.first_name = "".join(possible_name_list[1:-1]) + contact.note = " ".join(chinese_name_list) if len(contact.last_name) > 0 and len(contact.first_name) > 0: generate_contacts.append(contact) write_new_contacts_to_excel(generate_contacts) From 83b6c91f8093ad221dade74b0b8efd831f6db2cf Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Wed, 19 Apr 2023 20:13:53 +0200 Subject: [PATCH 04/10] try to correct the probleme of multi emails --- src/mail/mail_confirmation.py | 8 ++++--- src/mail/mail_reader.py | 42 ++++++++++++++++++++++++++++++----- src/pojo/mail/mail_pojo.py | 2 ++ src/utils/extract_name.py | 2 +- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/src/mail/mail_confirmation.py b/src/mail/mail_confirmation.py index d239b6a..7b829f5 100644 --- a/src/mail/mail_confirmation.py +++ b/src/mail/mail_confirmation.py @@ -175,13 +175,15 @@ def read_mails_and_find_confirmation_contacts(): for mail in mails_messages: message_body = mail.body for item in successful_items: - if item.id in message_body and item.id != "welcome": + if item.id in message_body and item.id != "welcome" and len(item.id) > 0: item.message = message_body accepted_appointment_list.append(item) - elif "10:30" in message_body and (item.email == mail.mail_address or item.email in message_body): + elif "10:30" in message_body and ( + item.email == mail.mail_address or item.email in message_body) and len(item.id) > 0: item.message = message_body accepted_appointment_list.append(item) - elif "11:30" in message_body and (item.email == mail.mail_address or item.email in message_body): + elif "11:30" in message_body and ( + item.email == mail.mail_address or item.email in message_body) and len(item.id) > 0: item.message = message_body accepted_appointment_list.append(item) print(mail.mail_address) diff --git a/src/mail/mail_reader.py b/src/mail/mail_reader.py index 83c167a..b4eefaf 100644 --- a/src/mail/mail_reader.py +++ b/src/mail/mail_reader.py @@ -20,11 +20,37 @@ VALIDATION_URL_SUBJECT_EN = 'Please confirm your appointment request' VALIDATION_URL_REGEX = """https:\/\/rendezvousparis.hermes.com\/client\/register\/[A-Z0-9]+\/validate.code=[A-Z0-9]+""" PART_VALIDATION_URL_REGEX = """client\/register\/[A-Z0-9]+\/validate.code=[A-Z0-9]+""" HERMES_EMAIL = "no-reply@hermes.com" +EMAIL_ADDRESS_REGEX = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b' date_format = "%d-%b-%Y" # DD-Mon-YYYY e.g., 3-Mar-2014 REDIRECTION_MAILS = "appointment2022@aol.com, chenpeijun@aol.com,hongjiang176@aol.com,ciyuexie@aol.com" +def check_email_address(email): + # pass the regular expression + # and the string into the fullmatch() method + if (re.fullmatch(EMAIL_ADDRESS_REGEX, email)): + print("Valid Email") + return True + else: + print("Invalid Email") + return False + + +def find_from_mail(param): + from_address, encoded_algo = param[0] + if isinstance(from_address, bytes): + from_address = from_address.decode(encoded_algo) + if not check_email_address(from_address) and len(param) == 2: + from_address, new_encode = param[1] + if new_encode is None: + new_encode = encoded_algo + if isinstance(from_address, bytes): + from_address = from_address.decode(new_encode) + return from_address.strip(" ").strip(">").strip("<") + return from_address.strip(" ").strip(">").strip("<") + + class MailReader(): def __init__(self, login, password): self.login = login @@ -92,11 +118,11 @@ class MailReader(): # if it's a bytes, decode to str subject = subject.decode(subject_encoded) # decode email sender - from_address, subject_encoded = decode_header(msg.get("From"))[0] - if isinstance(from_address, bytes): - from_address = from_address.decode(subject_encoded) + from_address = find_from_mail(decode_header(msg.get("From"))) + to_email = find_from_mail(decode_header(msg.get("To"))) print("Email:", self.login) print("From:", from_address) + print("To:", to_email) print("Subject:", subject) # if the email message is multipart if msg.is_multipart(): @@ -117,6 +143,10 @@ class MailReader(): print(body) if VALIDATION_URL_SUBJECT_fr in subject or VALIDATION_URL_SUBJECT_EN in subject: mail = MailPojo(subject=subject, body=body, from_address=from_address) + if to_email is None: + mail.to_address = self.login + else: + mail.to_address = to_email mail.mail_address = self.login mail_messages.append(mail) return mail_messages @@ -158,7 +188,7 @@ class MailReader(): def need_to_valid_url(url: str, successful_items) -> bool: - # return True + return True if len(successful_items) == 0: return False print("url is :" + url) @@ -214,7 +244,7 @@ def read_mails(): if is_time_between(time(7, 30), time(19, 30)): # get email address mail_list = MONGO_STORE_MANAGER.get_destination_emails() - # mail_address1 = MailAddress(mail="Saniremvazhaun@yahoo.com", password="hxwgldifdnuacoyr") + # mail_address1 = MailAddress(mail="appointment2022@aol.com", password="gyilpmvyyvlcaviq") # mail_address1 = MailAddress(mail="chenpeijun@aol.com", password="ytifuwguknzifqyb") # # mail_address3 = MailAddress(mail="ciyuexie@aol.com", password="czezlmmyypokdfce") # mail_list = [mail_address1] @@ -240,7 +270,7 @@ def read_mails(): # else: url = match.group(0) if need_to_valid_url(url, successful_items): - MONGO_STORE_MANAGER.save_links_to_validate(url, mail.mail_address) + MONGO_STORE_MANAGER.save_links_to_validate(url, mail.to_address) # url_validator = LinkValidator(url) print("need to validate url: " + url) # executor.submit(url_validator.start_page, params.get_proxy(ProxyType.OXYLABS), False) diff --git a/src/pojo/mail/mail_pojo.py b/src/pojo/mail/mail_pojo.py index 7df5653..fb054e0 100644 --- a/src/pojo/mail/mail_pojo.py +++ b/src/pojo/mail/mail_pojo.py @@ -22,6 +22,7 @@ class MailAddress: class MailPojo: from_address: str + to_address: str body: str subject: str mail_address: str = "" @@ -32,3 +33,4 @@ class MailPojo: self.subject = subject self.from_address = from_address self.isImapClient = False + self.to_address = "" diff --git a/src/utils/extract_name.py b/src/utils/extract_name.py index 94b3fe2..63c795d 100644 --- a/src/utils/extract_name.py +++ b/src/utils/extract_name.py @@ -90,7 +90,7 @@ def write_new_contacts_to_excel(valid_contacts: list): if __name__ == '__main__': db_manager = MongoDbManager() - mail_list = db_manager.get_destination_emails()[301:500] + mail_list = db_manager.get_destination_emails()[501:1000] # mail_list = db_manager.get_destination_emails()[50:200] generate_contacts = [] for mail in mail_list: From 42e634f34d3f5be6f60d552af6f2062daff8fb72 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Thu, 20 Apr 2023 00:00:43 +0200 Subject: [PATCH 05/10] extract name from pinyin list --- src/person_name/__init__.py | 0 src/person_name/convert_name_to_pinyin.py | 30 +++++ .../extract_name_with_pinyinlist.py | 110 ++++++++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 src/person_name/__init__.py create mode 100644 src/person_name/convert_name_to_pinyin.py create mode 100644 src/person_name/extract_name_with_pinyinlist.py diff --git a/src/person_name/__init__.py b/src/person_name/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/person_name/convert_name_to_pinyin.py b/src/person_name/convert_name_to_pinyin.py new file mode 100644 index 0000000..cbea248 --- /dev/null +++ b/src/person_name/convert_name_to_pinyin.py @@ -0,0 +1,30 @@ +from pypinyin import pinyin, lazy_pinyin, Style + + +def read_name_from_files_by_line(): + # Using readlines() + file1 = open('/Users/lpan/Downloads/Chinese_Names_Corpus.txt', 'r') + file2 = open('pinyin_list.txt', 'w') + lines = file1.readlines() + count = 0 + # Strips the newline character + for line in lines: + count += 1 + print("Line{}: {}".format(count, line.strip())) + name_to_save = convert_name_to_pinyin(line.strip()) + file2.writelines(name_to_save + "\n") + print(name_to_save) + file1.close() + file2.close() + + +def convert_name_to_pinyin(name: str): + name_in_pinyin_list = lazy_pinyin(name) + true_list = [] + for item in name_in_pinyin_list: + if item != '\ufeff': + true_list.append(item) + return true_list[0] + " " + "".join(true_list[1:len(true_list)]) + + +read_name_from_files_by_line() diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py new file mode 100644 index 0000000..abf1b20 --- /dev/null +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -0,0 +1,110 @@ +import itertools + +import xlsxwriter +from pyhanlp import * +from Pinyin2Hanzi import DefaultHmmParams +from Pinyin2Hanzi import viterbi +from itertools import combinations + +from src.db.mongo_manager import MongoDbManager +from src.pojo.contact_pojo import ContactPojo +from src.utils.excel_reader import get_random_phone_numbers, generate_email_from_name +from src.utils.generate_random_passport_id import get_random_passport_id_number + + +def get_ordered_combins(stuff): + list_to_return = [] + for i, j in itertools.combinations(range(len(stuff) + 1), 2): + print(stuff[i:j]) + list_to_return.append(stuff[i:j]) + return list_to_return + + +def get_better_list(list): + return list + # for name in list: + # if len(name) == 2: + # list.remove(name) + # return list + + +# 关键词提取 + +def read_pinyin_list_from_file() -> list: + file2 = open('clean_list.txt', 'r') + lines = file2.readlines() + name_list = [] + count = 0 + for line in lines: + count += 1 + print("Line{}: {}".format(count, line.strip())) + name_list.append(line.strip()) + return name_list + + +def generate_name_from_email(mail_address, pinyin_name_list): + # key_words = HanLP.extractKeyword(mail_address, 2) + # print(key_words) + # setence = "".join(key_words) + all_combins = get_ordered_combins(mail_address) + for i in all_combins: + word_to_test = "".join(i) + if len(word_to_test) >= 3: + # print("word to test is " + word_to_test) + for name in pinyin_name_list: + last_name = name.split(" ")[0] + first_name = name.split(" ")[-1] + if word_to_test in last_name: + return last_name, first_name + elif word_to_test in first_name: + return last_name, first_name + + # 选择不重复的 + # if len(pinyin_name_list) > 3: + # return get_better_list(pinyin_name_list) + # else: + return None + + +def write_new_contacts_to_excel(valid_contacts: list): + row = 0 + col = 0 + # Create a workbook and add a worksheet. + workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts))) + header_data = ['name', 'phone', 'passport', 'email', 'note'] + worksheet = workbook.add_worksheet() + header_format = workbook.add_format({'bold': True}) + + for col_num, data in enumerate(header_data): + worksheet.write(row, col_num, data, header_format) + row = row + 1 + for info in valid_contacts: + info.phone = get_random_phone_numbers() + info.passport = get_random_passport_id_number() + # Iterate over the data and write it out row by row. + worksheet.write(row, col, "{} {}".format(info.last_name, info.first_name)) + worksheet.write(row, col + 1, info.phone) + worksheet.write(row, col + 2, info.passport) + worksheet.write(row, col + 3, info.mail) + worksheet.write(row, col + 4, info.note) + row += 1 + workbook.close() + + +if __name__ == '__main__': + db_manager = MongoDbManager() + mail_list = db_manager.get_destination_emails()[1001:1200] + # mail_list = db_manager.get_destination_emails()[50:200] + generate_contacts = [] + pinyin_name_list = read_pinyin_list_from_file() + + for mail in mail_list: + contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="") + spliteed = mail.mail.split("@") + possible_name_list = generate_name_from_email(spliteed[0], pinyin_name_list) + if possible_name_list is not None: + contact.last_name = possible_name_list[0] + contact.first_name = possible_name_list[1] + if len(contact.last_name) > 0 and len(contact.first_name) > 0: + generate_contacts.append(contact) + write_new_contacts_to_excel(generate_contacts) From c99e55911704869ce67ed64e1e5bc7caaa61c3d5 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Thu, 20 Apr 2023 11:21:20 +0200 Subject: [PATCH 06/10] filter rdv --- src/mail/mail_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mail/mail_reader.py b/src/mail/mail_reader.py index b4eefaf..a47017a 100644 --- a/src/mail/mail_reader.py +++ b/src/mail/mail_reader.py @@ -188,7 +188,7 @@ class MailReader(): def need_to_valid_url(url: str, successful_items) -> bool: - return True + # return True if len(successful_items) == 0: return False print("url is :" + url) From 1a2c516f49516ce5092b8af791b18dc5ac63bde0 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Thu, 20 Apr 2023 20:33:16 +0200 Subject: [PATCH 07/10] add extract_name_with_pinyinlist.py --- src/person_name/extract_name_with_pinyinlist.py | 2 +- src/utils/excel_reader.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py index abf1b20..af5c1c3 100644 --- a/src/person_name/extract_name_with_pinyinlist.py +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -93,7 +93,7 @@ def write_new_contacts_to_excel(valid_contacts: list): if __name__ == '__main__': db_manager = MongoDbManager() - mail_list = db_manager.get_destination_emails()[1001:1200] + mail_list = db_manager.get_destination_emails()[1201:1500] # mail_list = db_manager.get_destination_emails()[50:200] generate_contacts = [] pinyin_name_list = read_pinyin_list_from_file() diff --git a/src/utils/excel_reader.py b/src/utils/excel_reader.py index 4989c37..f7eb231 100644 --- a/src/utils/excel_reader.py +++ b/src/utils/excel_reader.py @@ -39,6 +39,7 @@ class ExcelHelper: print(user_agent_list) def read_contacts(self, file_name=CONTACT_LIST_FILE) -> list: + print("read file " + file_name) contact_list_in_json = pandas.read_excel(file_name).to_json(orient='records') contact_dict_list = json.loads(contact_list_in_json) contact_list = [] @@ -60,8 +61,8 @@ class ExcelHelper: contact_list.append(contact) return contact_list - def check_contact_list(self): - contact_list = self.read_contacts() + def check_contact_list(self, file_name=CONTACT_LIST_FILE): + contact_list = self.read_contacts(file_name) for contact in contact_list: if contact.first_name is None or len(contact.first_name) == 0: print("error in firstName for " + contact.mail) @@ -227,12 +228,12 @@ def save_mails_to_db(): if __name__ == '__main__': # excel_reader = ExcelHelper() - # contacts = excel_reader.read_names("/Users/lpan/Downloads/10_rambler_ru_23_03_2023.xlsx") + # contacts = excel_reader.read_names("/Users/lpan/Downloads/gmail_10.xlsx") # print(contacts) # write_new_contacts_to_excel(valid_contacts=contacts) - # excel_reader = ExcelHelper() - # excel_reader.check_contact_list() - save_mails_to_db() + excel_reader = ExcelHelper() + excel_reader.check_contact_list("/Users/lpan/Desktop/contact_email_valid.xlsx") + # save_mails_to_db() # for mail in excel_reader.read_mails_and_pwd(): # MONGO_STORE_MANAGER.insert_email(mail) # for i in range(1, 64): From 3832282500bac377d3ca361fc723ec3326b33247 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Sun, 23 Apr 2023 17:04:47 +0200 Subject: [PATCH 08/10] generate 2501-2700 --- src/person_name/extract_name_with_pinyinlist.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py index af5c1c3..0b29794 100644 --- a/src/person_name/extract_name_with_pinyinlist.py +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -10,7 +10,7 @@ from src.db.mongo_manager import MongoDbManager from src.pojo.contact_pojo import ContactPojo from src.utils.excel_reader import get_random_phone_numbers, generate_email_from_name from src.utils.generate_random_passport_id import get_random_passport_id_number - +import random def get_ordered_combins(stuff): list_to_return = [] @@ -93,11 +93,12 @@ def write_new_contacts_to_excel(valid_contacts: list): if __name__ == '__main__': db_manager = MongoDbManager() - mail_list = db_manager.get_destination_emails()[1201:1500] + mail_list = db_manager.get_destination_emails()[2501:2700] + # mail_list = db_manager.get_destination_emails()[6800:7000] # mail_list = db_manager.get_destination_emails()[50:200] generate_contacts = [] pinyin_name_list = read_pinyin_list_from_file() - + random.shuffle(pinyin_name_list) for mail in mail_list: contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="") spliteed = mail.mail.split("@") From 6476ff1a39127e395a954b56b47109869ae1baab Mon Sep 17 00:00:00 2001 From: Lei PAN Date: Sun, 23 Apr 2023 19:03:20 +0200 Subject: [PATCH 09/10] optimize the imports --- src/person_name/extract_name_with_pinyinlist.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py index 0b29794..feb3a7e 100644 --- a/src/person_name/extract_name_with_pinyinlist.py +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -1,16 +1,13 @@ import itertools +import random import xlsxwriter -from pyhanlp import * -from Pinyin2Hanzi import DefaultHmmParams -from Pinyin2Hanzi import viterbi -from itertools import combinations from src.db.mongo_manager import MongoDbManager from src.pojo.contact_pojo import ContactPojo -from src.utils.excel_reader import get_random_phone_numbers, generate_email_from_name +from src.utils.excel_reader import get_random_phone_numbers from src.utils.generate_random_passport_id import get_random_passport_id_number -import random + def get_ordered_combins(stuff): list_to_return = [] @@ -93,8 +90,8 @@ def write_new_contacts_to_excel(valid_contacts: list): if __name__ == '__main__': db_manager = MongoDbManager() - mail_list = db_manager.get_destination_emails()[2501:2700] - # mail_list = db_manager.get_destination_emails()[6800:7000] + mail_list = db_manager.get_destination_emails()[0:100] + # mail_list = db_manager.get_destination_emails()[3001:3200] # mail_list = db_manager.get_destination_emails()[50:200] generate_contacts = [] pinyin_name_list = read_pinyin_list_from_file() From b23bd24970a4dc7873f49302f9bd94a9cba967f9 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Fri, 28 Apr 2023 20:21:25 +0200 Subject: [PATCH 10/10] extract new contacts --- src/person_name/extract_name_with_pinyinlist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py index feb3a7e..c7c1025 100644 --- a/src/person_name/extract_name_with_pinyinlist.py +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -90,7 +90,7 @@ def write_new_contacts_to_excel(valid_contacts: list): if __name__ == '__main__': db_manager = MongoDbManager() - mail_list = db_manager.get_destination_emails()[0:100] + mail_list = db_manager.get_destination_emails()[101:1000] # mail_list = db_manager.get_destination_emails()[3001:3200] # mail_list = db_manager.get_destination_emails()[50:200] generate_contacts = []