From 4edecff99c90dd86ed67cae99fec965688396990 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Fri, 5 May 2023 10:28:15 +0200 Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E5=8F=96=E5=90=8D=E5=AD=97=E6=97=B6?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E6=94=B9=E4=B8=BA=E9=9C=80=E8=A6=815?= =?UTF-8?q?=E4=B8=AA=E8=BF=9E=E7=BB=AD=E5=AD=97=E7=AC=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../extract_name_with_pinyinlist.py | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py index c7c1025..908aef4 100644 --- a/src/person_name/extract_name_with_pinyinlist.py +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -1,5 +1,6 @@ import itertools import random +from concurrent.futures import ThreadPoolExecutor import xlsxwriter @@ -11,7 +12,7 @@ from src.utils.generate_random_passport_id import get_random_passport_id_number def get_ordered_combins(stuff): list_to_return = [] - for i, j in itertools.combinations(range(len(stuff) + 1), 2): + for i, j in itertools.combinations(range(len(stuff) + 3), 2): print(stuff[i:j]) list_to_return.append(stuff[i:j]) return list_to_return @@ -43,18 +44,25 @@ def generate_name_from_email(mail_address, pinyin_name_list): # key_words = HanLP.extractKeyword(mail_address, 2) # print(key_words) # setence = "".join(key_words) + print("generate for " + mail_address) all_combins = get_ordered_combins(mail_address) for i in all_combins: word_to_test = "".join(i) - if len(word_to_test) >= 3: + if len(word_to_test) >= 5: # print("word to test is " + word_to_test) for name in pinyin_name_list: last_name = name.split(" ")[0] first_name = name.split(" ")[-1] + full_name = last_name + first_name + full_name_inverse = first_name + last_name if word_to_test in last_name: return last_name, first_name elif word_to_test in first_name: return last_name, first_name + elif word_to_test in full_name: + return last_name, first_name + elif word_to_test in full_name_inverse: + return last_name, first_name # 选择不重复的 # if len(pinyin_name_list) > 3: @@ -88,21 +96,28 @@ def write_new_contacts_to_excel(valid_contacts: list): workbook.close() +def find_contact(generate_contacts: list, mail): + contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="") + spliteed = mail.mail.split("@") + possible_name_list = generate_name_from_email(spliteed[0], pinyin_name_list) + if possible_name_list is not None: + contact.last_name = possible_name_list[0] + contact.first_name = possible_name_list[1] + if len(contact.last_name) > 0 and len(contact.first_name) > 0: + generate_contacts.append(contact) + + if __name__ == '__main__': db_manager = MongoDbManager() - mail_list = db_manager.get_destination_emails()[101:1000] + mail_list = db_manager.get_destination_emails()[5000:6000] # mail_list = db_manager.get_destination_emails()[3001:3200] # mail_list = db_manager.get_destination_emails()[50:200] generate_contacts = [] pinyin_name_list = read_pinyin_list_from_file() random.shuffle(pinyin_name_list) - for mail in mail_list: - contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="") - spliteed = mail.mail.split("@") - possible_name_list = generate_name_from_email(spliteed[0], pinyin_name_list) - if possible_name_list is not None: - contact.last_name = possible_name_list[0] - contact.first_name = possible_name_list[1] - if len(contact.last_name) > 0 and len(contact.first_name) > 0: - generate_contacts.append(contact) + with ThreadPoolExecutor(max_workers=200) as executor: + for mail in mail_list: + executor.submit(find_contact, generate_contacts, mail) + # for mail in mail_list: + # find_contact(generate_contacts, mail) write_new_contacts_to_excel(generate_contacts)