提取名字时,修改为需要5个连续字符

This commit is contained in:
2023-05-05 10:28:15 +02:00
parent 7625771b27
commit 4edecff99c
+27 -12
View File
@@ -1,5 +1,6 @@
import itertools import itertools
import random import random
from concurrent.futures import ThreadPoolExecutor
import xlsxwriter import xlsxwriter
@@ -11,7 +12,7 @@ from src.utils.generate_random_passport_id import get_random_passport_id_number
def get_ordered_combins(stuff):
    """Return every non-empty contiguous slice of ``stuff``, in order.

    Slices are ordered by start index, then by end index, so ``"abc"``
    yields ``["a", "ab", "abc", "b", "bc", "c"]``. Each slice is also
    printed as it is produced.

    Args:
        stuff: A sliceable sequence with a length (a string in this file).

    Returns:
        A list of slices of ``stuff``; empty when ``stuff`` is empty.
    """
    list_to_return = []
    # Slice boundaries run over 0..len(stuff) inclusive. Going past
    # len(stuff) only repeats tail slices that were already produced and
    # adds empty slices, because Python clamps out-of-range slice ends.
    for start, end in itertools.combinations(range(len(stuff) + 1), 2):
        piece = stuff[start:end]
        print(piece)
        list_to_return.append(piece)
    return list_to_return
@@ -43,18 +44,25 @@ def generate_name_from_email(mail_address, pinyin_name_list):
# key_words = HanLP.extractKeyword(mail_address, 2) # key_words = HanLP.extractKeyword(mail_address, 2)
# print(key_words) # print(key_words)
# setence = "".join(key_words) # setence = "".join(key_words)
print("generate for " + mail_address)
all_combins = get_ordered_combins(mail_address) all_combins = get_ordered_combins(mail_address)
for i in all_combins: for i in all_combins:
word_to_test = "".join(i) word_to_test = "".join(i)
if len(word_to_test) >= 3: if len(word_to_test) >= 5:
# print("word to test is " + word_to_test) # print("word to test is " + word_to_test)
for name in pinyin_name_list: for name in pinyin_name_list:
last_name = name.split(" ")[0] last_name = name.split(" ")[0]
first_name = name.split(" ")[-1] first_name = name.split(" ")[-1]
full_name = last_name + first_name
full_name_inverse = first_name + last_name
if word_to_test in last_name: if word_to_test in last_name:
return last_name, first_name return last_name, first_name
elif word_to_test in first_name: elif word_to_test in first_name:
return last_name, first_name return last_name, first_name
elif word_to_test in full_name:
return last_name, first_name
elif word_to_test in full_name_inverse:
return last_name, first_name
# 选择不重复的 # 选择不重复的
# if len(pinyin_name_list) > 3: # if len(pinyin_name_list) > 3:
@@ -88,21 +96,28 @@ def write_new_contacts_to_excel(valid_contacts: list):
workbook.close() workbook.close()
def find_contact(generate_contacts: list, mail):
    """Guess a name for one mail entry and append the resulting contact.

    The part of ``mail.mail`` before the first "@" is passed to
    ``generate_name_from_email``. When that returns a pair whose last name
    and first name are both non-empty, a ``ContactPojo`` carrying them is
    appended to ``generate_contacts``; otherwise nothing is appended.

    Args:
        generate_contacts: List that receives matched contacts (mutated in
            place).
        mail: Object exposing the address as ``mail.mail``.

    Note:
        Reads the module-level ``pinyin_name_list``, which in this file is
        assigned in the ``__main__`` block before this function is called.
    """
    local_part = mail.mail.split("@")[0]
    name_pair = generate_name_from_email(local_part, pinyin_name_list)
    if name_pair is None:
        # No candidate name matched this address; nothing to record.
        return

    contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="")
    contact.last_name = name_pair[0]
    contact.first_name = name_pair[1]
    if len(contact.last_name) > 0 and len(contact.first_name) > 0:
        generate_contacts.append(contact)
if __name__ == '__main__':
    # Script entry point: load a batch of destination e-mails, try to match a
    # pinyin name to each address, and write the matched contacts to Excel.
    db_manager = MongoDbManager()
    # The slice selects which batch of addresses this run processes.
    mail_list = db_manager.get_destination_emails()[5000:6000]
    generate_contacts = []
    # find_contact reads this module-level list, so it must be assigned
    # before any task is submitted.
    pinyin_name_list = read_pinyin_list_from_file()
    random.shuffle(pinyin_name_list)

    submitted = []
    with ThreadPoolExecutor(max_workers=200) as executor:
        for mail in mail_list:
            submitted.append((mail, executor.submit(find_contact, generate_contacts, mail)))

    # Leaving the `with` block waits for every task. A future stores any
    # exception raised by its task, so report failures instead of dropping
    # them silently.
    for mail, future in submitted:
        error = future.exception()
        if error is not None:
            print("find_contact failed for " + str(mail.mail) + ": " + repr(error))

    write_new_contacts_to_excel(generate_contacts)