提取名字时,修改为需要5个连续字符
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
import itertools
|
import itertools
|
||||||
import random
|
import random
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
import xlsxwriter
|
import xlsxwriter
|
||||||
|
|
||||||
@@ -11,7 +12,7 @@ from src.utils.generate_random_passport_id import get_random_passport_id_number
|
|||||||
|
|
||||||
def get_ordered_combins(stuff):
    """Return every non-empty contiguous slice of ``stuff``.

    Slices are produced in order of start index, then end index, so for
    ``"abc"`` the result is ``["a", "ab", "abc", "b", "bc", "c"]``.

    Args:
        stuff: A sliceable sequence that supports ``len`` (e.g. a ``str``).

    Returns:
        A list holding ``stuff[i:j]`` for every ``0 <= i < j <= len(stuff)``.
        An empty input yields an empty list.
    """
    list_to_return = []
    # End indices stop at len(stuff): anything larger only repeats a suffix
    # that was already emitted or produces an empty slice.
    for i, j in itertools.combinations(range(len(stuff) + 1), 2):
        # Trace output of each candidate slice, kept from the original.
        print(stuff[i:j])
        list_to_return.append(stuff[i:j])
    return list_to_return
|
||||||
@@ -43,18 +44,25 @@ def generate_name_from_email(mail_address, pinyin_name_list):
|
|||||||
# key_words = HanLP.extractKeyword(mail_address, 2)
|
# key_words = HanLP.extractKeyword(mail_address, 2)
|
||||||
# print(key_words)
|
# print(key_words)
|
||||||
# setence = "".join(key_words)
|
# setence = "".join(key_words)
|
||||||
|
print("generate for " + mail_address)
|
||||||
all_combins = get_ordered_combins(mail_address)
|
all_combins = get_ordered_combins(mail_address)
|
||||||
for i in all_combins:
|
for i in all_combins:
|
||||||
word_to_test = "".join(i)
|
word_to_test = "".join(i)
|
||||||
if len(word_to_test) >= 3:
|
if len(word_to_test) >= 5:
|
||||||
# print("word to test is " + word_to_test)
|
# print("word to test is " + word_to_test)
|
||||||
for name in pinyin_name_list:
|
for name in pinyin_name_list:
|
||||||
last_name = name.split(" ")[0]
|
last_name = name.split(" ")[0]
|
||||||
first_name = name.split(" ")[-1]
|
first_name = name.split(" ")[-1]
|
||||||
|
full_name = last_name + first_name
|
||||||
|
full_name_inverse = first_name + last_name
|
||||||
if word_to_test in last_name:
|
if word_to_test in last_name:
|
||||||
return last_name, first_name
|
return last_name, first_name
|
||||||
elif word_to_test in first_name:
|
elif word_to_test in first_name:
|
||||||
return last_name, first_name
|
return last_name, first_name
|
||||||
|
elif word_to_test in full_name:
|
||||||
|
return last_name, first_name
|
||||||
|
elif word_to_test in full_name_inverse:
|
||||||
|
return last_name, first_name
|
||||||
|
|
||||||
# 选择不重复的
|
# 选择不重复的
|
||||||
# if len(pinyin_name_list) > 3:
|
# if len(pinyin_name_list) > 3:
|
||||||
@@ -88,21 +96,28 @@ def write_new_contacts_to_excel(valid_contacts: list):
|
|||||||
workbook.close()
|
workbook.close()
|
||||||
|
|
||||||
|
|
||||||
|
def find_contact(generate_contacts: list, mail):
    """Try to derive a name for one mail record and collect the result.

    The part of ``mail.mail`` before the first ``"@"`` is passed to
    ``generate_name_from_email`` together with the module-level
    ``pinyin_name_list``. When that call returns a pair and both entries are
    non-empty, a ``ContactPojo`` carrying the address and the two names is
    appended to ``generate_contacts``; otherwise nothing is appended.

    Args:
        generate_contacts: List that receives the populated contact.
        mail: Object exposing the address as its ``mail`` attribute.
    """
    candidate = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="")
    # Everything before the first "@" is the text searched for a name.
    local_part = mail.mail.partition("@")[0]
    guessed_name = generate_name_from_email(local_part, pinyin_name_list)
    if guessed_name is not None:
        candidate.last_name = guessed_name[0]
        candidate.first_name = guessed_name[1]
    # Without a match both names are still empty, so the contact is dropped.
    if len(candidate.last_name) == 0 or len(candidate.first_name) == 0:
        return
    generate_contacts.append(candidate)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Script entry point: load a slice of destination e-mails, try to derive a
    # name for each one in a thread pool, and write the matches to Excel.

    # Imported here because only this entry point needs it.
    from concurrent.futures import as_completed

    db_manager = MongoDbManager()
    mail_list = db_manager.get_destination_emails()[5000:6000]
    generate_contacts = []
    # Must stay a module-level name: find_contact reads pinyin_name_list as a global.
    pinyin_name_list = read_pinyin_list_from_file()
    random.shuffle(pinyin_name_list)
    with ThreadPoolExecutor(max_workers=200) as executor:
        # Keep each future so a failure inside a worker is reported instead of
        # being silently discarded with the future object.
        pending = {
            executor.submit(find_contact, generate_contacts, mail): mail
            for mail in mail_list
        }
        for future in as_completed(pending):
            try:
                future.result()
            except Exception as exc:
                # Best effort: one bad address must not abort the whole batch.
                print("failed to generate contact for {}: {!r}".format(pending[future].mail, exc))
    # Workers append as they finish, so the output order need not follow mail_list.
    write_new_contacts_to_excel(generate_contacts)
|
||||||
|
|||||||
Reference in New Issue
Block a user