From 7bbdf6ca9cf46622abd2f58f9366dc2196762860 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Tue, 18 Apr 2023 20:23:14 +0200 Subject: [PATCH] add note in the excel --- src/pojo/contact_pojo.py | 2 ++ src/utils/extract_name.py | 39 +++++++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/pojo/contact_pojo.py b/src/pojo/contact_pojo.py index 3e1fe02..30cb475 100644 --- a/src/pojo/contact_pojo.py +++ b/src/pojo/contact_pojo.py @@ -12,6 +12,7 @@ class ContactPojo: mail: str ccid: str position: int + note: str def __init__(self, phone_number: str, passport_number: str, last_name: str, first_name: str, mail: str, ccid: str = "", @@ -23,6 +24,7 @@ class ContactPojo: self.ccid = ccid self.mail = mail self.position = position + self.note = "" def to_firestore_dict(self): dest = { diff --git a/src/utils/extract_name.py b/src/utils/extract_name.py index ea47ef8..94b3fe2 100644 --- a/src/utils/extract_name.py +++ b/src/utils/extract_name.py @@ -21,10 +21,11 @@ def get_ordered_combins(stuff): def get_better_list(list): - for name in list: - if len(name) == 2: - list.remove(name) - return list + return list + # for name in list: + # if len(name) == 2: + # list.remove(name) + # return list # 关键词提取 @@ -34,7 +35,8 @@ def generate_name_from_email(mail_address): key_words = HanLP.extractKeyword(mail_address, 2) print(key_words) hmmparams = DefaultHmmParams() - possible_name_list = [] + pinyin_name_list = [] + chinese_name_list = [] setence = "".join(key_words) all_combins = get_ordered_combins(setence) for i in all_combins: @@ -45,19 +47,20 @@ def generate_name_from_email(mail_address): result = viterbi(hmm_params=hmmparams, observations=(word_to_test,), path_num=2) for item in result: print(item.score, item.path) + chinese_name_list.extend(item.path) # print("word is " + word_to_test) - if len(word_to_test) >= 3: - all_combins.remove(i) - possible_name_list.append(word_to_test) + # if len(word_to_test) >= 3: + # all_combins.remove(i) + pinyin_name_list.append(word_to_test) except Exception as error: print(error) - print(possible_name_list) + print(pinyin_name_list) # 选择不重复的 - if len(possible_name_list) > 3: - return get_better_list(possible_name_list) - else: - return possible_name_list + # if len(pinyin_name_list) > 3: + # return get_better_list(pinyin_name_list) + # else: + return pinyin_name_list, chinese_name_list def write_new_contacts_to_excel(valid_contacts: list): @@ -65,7 +68,7 @@ def write_new_contacts_to_excel(valid_contacts: list): col = 0 # Create a workbook and add a worksheet. workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts))) - header_data = ['name', 'phone', 'passport', 'email'] + header_data = ['name', 'phone', 'passport', 'email', 'note'] worksheet = workbook.add_worksheet() header_format = workbook.add_format({'bold': True}) @@ -80,21 +83,25 @@ def write_new_contacts_to_excel(valid_contacts: list): worksheet.write(row, col + 1, info.phone) worksheet.write(row, col + 2, info.passport) worksheet.write(row, col + 3, info.mail) + worksheet.write(row, col + 4, info.note) row += 1 workbook.close() if __name__ == '__main__': db_manager = MongoDbManager() - mail_list = db_manager.get_destination_emails()[50:200] + mail_list = db_manager.get_destination_emails()[301:500] + # mail_list = db_manager.get_destination_emails()[50:200] generate_contacts = [] for mail in mail_list: contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="") spliteed = mail.mail.split("@") - possible_name_list = generate_name_from_email(spliteed[0]) + possible_name_list = generate_name_from_email(spliteed[0])[0] + chinese_name_list = generate_name_from_email(spliteed[0])[1] if len(possible_name_list) >= 2: contact.last_name = possible_name_list[0] contact.first_name = "".join(possible_name_list[1:-1]) + contact.note = " ".join(chinese_name_list) if len(contact.last_name) > 0 and len(contact.first_name) > 0: generate_contacts.append(contact) write_new_contacts_to_excel(generate_contacts)