"""Guess a contact's name from the local part of an e-mail address.

Provenance: added as ``src/utils/extract_name.py`` by commit 48bd46f
("try to extract name from email address", PAN Lei, 2023-04-15).

NOTE(review): the reviewed copy of this file had lost its line breaks and
indentation, so the statement nesting (most notably inside
``generate_name_from_email``) is a reconstruction -- confirm it against the
original commit.

Third-party and project imports are done inside the functions that need
them, so the pure helpers below stay importable when those packages are not
installed.
"""

import itertools
from itertools import combinations  # noqa: F401 - unused, kept from the original import list


def get_ordered_combins(stuff):
    """Return every non-empty contiguous slice of ``stuff``.

    Slices are ordered by start index and then by end index, so ``"abc"``
    yields ``["a", "ab", "abc", "b", "bc", "c"]``.  Each slice is also
    printed as it is produced.

    Args:
        stuff: Any sliceable sequence (a string in this module).

    Returns:
        A new list holding the slices; empty when ``stuff`` is empty.
    """
    list_to_return = []
    # Every pair start < end over 0..len(stuff) names exactly one slice.
    for start, end in itertools.combinations(range(len(stuff) + 1), 2):
        piece = stuff[start:end]
        print(piece)
        list_to_return.append(piece)
    return list_to_return


def get_better_list(list):  # noqa: A002 - parameter name kept for existing callers
    """Remove every two-character entry from ``list`` in place.

    The previous implementation called ``list.remove`` while iterating over
    the same list, which made the iterator skip the entry that followed each
    removal; consecutive two-character entries were therefore only partly
    removed.  The filter below visits every entry exactly once.

    Args:
        list: Mutable list of candidate strings.  It is modified in place.

    Returns:
        The same list object, now without any entry of length 2.
    """
    # Slice assignment keeps the caller's list object while replacing its contents.
    list[:] = [name for name in list if len(name) != 2]
    return list


def generate_name_from_email(mail_address):
    """Collect substrings of ``mail_address`` that the pinyin HMM accepts.

    Steps:
      1. ``HanLP.extractKeyword(mail_address, 2)`` extracts keywords, which
         are concatenated into one string.
      2. Every contiguous substring of at least two characters is passed to
         ``Pinyin2Hanzi.viterbi`` as a single observation.
      3. A substring is kept when ``viterbi`` returns without raising
         (presumably meaning it is a known pinyin syllable -- TODO confirm
         against Pinyin2Hanzi).

    Keywords, substrings, decoded paths, errors and the final candidate list
    are printed along the way.

    Args:
        mail_address: Text to analyse; the script passes the part of the
            address before ``@``.

    Returns:
        List of accepted substrings in enumeration order.  When more than
        three were found, the two-character ones are removed first.
    """
    from Pinyin2Hanzi import DefaultHmmParams
    from Pinyin2Hanzi import viterbi
    from pyhanlp import HanLP

    # Keyword extraction.
    key_words = HanLP.extractKeyword(mail_address, 2)
    print(key_words)
    hmmparams = DefaultHmmParams()
    possible_name_list = []
    sentence = "".join(key_words)

    # The original removed accepted substrings from the list it was looping
    # over, which silently skipped the following substring.  That list is
    # never read afterwards, so the removal is dropped and every substring
    # is now tested.
    for word_to_test in get_ordered_combins(sentence):
        if len(word_to_test) < 2:
            continue
        try:
            result = viterbi(
                hmm_params=hmmparams,
                observations=(word_to_test,),
                path_num=2,
            )
        except Exception as error:  # best effort: report and try the next substring
            print(error)
        else:
            for item in result:
                print(item.score, item.path)
            # NOTE(review): reconstructed as one append per accepted
            # substring (not one per decoded path) -- confirm.
            possible_name_list.append(word_to_test)

    print(possible_name_list)
    # Original comment here read "pick the non-duplicates"; what the code does
    # is drop the two-character candidates once there are more than three.
    if len(possible_name_list) > 3:
        return get_better_list(possible_name_list)
    return possible_name_list


def write_new_contacts_to_excel(valid_contacts: list):
    """Write the contacts to ``real_name_contacts_<count>.xlsx``.

    The sheet has a bold header row (name, phone, passport, email) followed
    by one row per contact.  Each contact's ``phone`` and ``passport``
    attributes are overwritten with freshly generated random values before
    its row is written.

    Args:
        valid_contacts: Contacts exposing ``last_name``, ``first_name`` and
            ``mail``; ``phone`` and ``passport`` are assigned here.
    """
    import xlsxwriter

    from src.utils.excel_reader import get_random_phone_numbers
    from src.utils.generate_random_passport_id import get_random_passport_id_number

    header_data = ['name', 'phone', 'passport', 'email']
    # Create a workbook and add a worksheet.
    workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts)))
    worksheet = workbook.add_worksheet()
    header_format = workbook.add_format({'bold': True})

    for col_num, data in enumerate(header_data):
        worksheet.write(0, col_num, data, header_format)

    # Data rows start right below the header row.
    for row, info in enumerate(valid_contacts, start=1):
        info.phone = get_random_phone_numbers()
        info.passport = get_random_passport_id_number()
        worksheet.write(row, 0, "{} {}".format(info.last_name, info.first_name))
        worksheet.write(row, 1, info.phone)
        worksheet.write(row, 2, info.passport)
        worksheet.write(row, 3, info.mail)
    workbook.close()


def main():
    """Build name guesses for the first 50 destination e-mails and export them."""
    from src.db.mongo_manager import MongoDbManager
    from src.pojo.contact_pojo import ContactPojo

    db_manager = MongoDbManager()
    mail_list = db_manager.get_destination_emails()[0:50]
    generate_contacts = []
    for mail in mail_list:
        contact = ContactPojo(
            mail=mail.mail,
            phone_number="",
            passport_number="",
            last_name="",
            first_name="",
        )
        local_part = mail.mail.split("@")[0]
        possible_name_list = generate_name_from_email(local_part)
        if len(possible_name_list) >= 2:
            contact.last_name = possible_name_list[0]
            # NOTE(review): [1:-1] leaves out the final candidate, so exactly
            # two candidates give an empty first name and the contact is
            # dropped by the check below -- confirm this is intended.
            contact.first_name = "".join(possible_name_list[1:-1])
        # Only contacts with both name parts are exported.
        if len(contact.last_name) > 0 and len(contact.first_name) > 0:
            generate_contacts.append(contact)
    write_new_contacts_to_excel(generate_contacts)


if __name__ == '__main__':
    main()