extract name from pinyin list

This commit is contained in:
2023-04-20 00:00:43 +02:00
parent 83b6c91f80
commit 42e634f34d
3 changed files with 140 additions and 0 deletions
View File
+30
View File
@@ -0,0 +1,30 @@
from pypinyin import pinyin, lazy_pinyin, Style
def read_name_from_files_by_line(
        source_path='/Users/lpan/Downloads/Chinese_Names_Corpus.txt',
        target_path='pinyin_list.txt'):
    """Convert a file of names into a pinyin list, one entry per line.

    Every line of ``source_path`` is stripped, converted with
    ``convert_name_to_pinyin`` and written as one line of ``target_path``.
    The input line and its conversion are echoed to stdout as progress.

    Args:
        source_path: Text file holding one name per line.
        target_path: Output text file; it is overwritten if it exists.
    """
    # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if present,
    # so the result does not depend on the platform's default encoding.
    # The ``with`` block closes both files even if a conversion raises.
    with open(source_path, 'r', encoding='utf-8-sig') as source, \
            open(target_path, 'w', encoding='utf-8') as target:
        for count, line in enumerate(source, start=1):
            name = line.strip()
            print("Line{}: {}".format(count, name))
            if not name:
                # Nothing to convert on a blank line; skip it instead of
                # aborting the whole run.
                continue
            name_to_save = convert_name_to_pinyin(name)
            target.write(name_to_save + "\n")
            print(name_to_save)
def convert_name_to_pinyin(name: str):
    """Render ``name`` as ``"<first item> <remaining items joined>"``.

    The name is passed to ``lazy_pinyin``; any returned item that is exactly
    the byte-order mark ``'\\ufeff'`` is discarded. The first remaining item
    is followed by a space and then by all other items concatenated without a
    separator (presumably family name, then given name -- confirm against the
    input corpus).

    Args:
        name: The name to transliterate.

    Returns:
        The formatted string, or ``""`` when no item remains after filtering
        (previously this case raised ``IndexError``).
    """
    syllables = [item for item in lazy_pinyin(name) if item != '\ufeff']
    if not syllables:
        return ""
    first, *rest = syllables
    return first + " " + "".join(rest)
# Module-level call: the conversion runs whenever this file is executed or
# imported, because there is no ``__main__`` guard around it.
read_name_from_files_by_line()
@@ -0,0 +1,110 @@
import itertools
import xlsxwriter
from pyhanlp import *
from Pinyin2Hanzi import DefaultHmmParams
from Pinyin2Hanzi import viterbi
from itertools import combinations
from src.db.mongo_manager import MongoDbManager
from src.pojo.contact_pojo import ContactPojo
from src.utils.excel_reader import get_random_phone_numbers, generate_email_from_name
from src.utils.generate_random_passport_id import get_random_passport_id_number
def get_ordered_combins(stuff):
    """Return every non-empty contiguous slice of ``stuff``.

    Slices are ordered by start index and, for equal starts, by increasing
    length, e.g. ``"abc"`` -> ``["a", "ab", "abc", "b", "bc", "c"]``.

    Args:
        stuff: Any sliceable sequence with a length (string, list, ...).

    Returns:
        A list of slices of ``stuff``; empty when ``stuff`` is empty.
    """
    # Each index pair (start, stop) with start < stop selects one slice;
    # combinations() yields the pairs in lexicographic order.
    return [
        stuff[start:stop]
        for start, stop in itertools.combinations(range(len(stuff) + 1), 2)
    ]
def get_better_list(list):
    """Return the given list unchanged (the same object, not a copy).

    Args:
        list: The list to pass through.

    Returns:
        The ``list`` argument itself.
    """
    # NOTE: a draft that removed every entry of length 2 from the list was
    # left disabled here, so no filtering takes place.
    return list
# Keyword extraction
def read_pinyin_list_from_file(path='clean_list.txt') -> list:
    """Load a list of names from a text file, one entry per line.

    Each line is stripped of surrounding whitespace and echoed to stdout with
    its 1-based line number. Blank lines are kept as empty strings.

    Args:
        path: File to read; defaults to ``clean_list.txt`` in the current
            working directory.

    Returns:
        The stripped lines, in file order.
    """
    name_list = []
    # The ``with`` block closes the file (the handle used to be leaked), and
    # the explicit encoding avoids depending on the platform default.
    with open(path, 'r', encoding='utf-8') as name_file:
        for count, line in enumerate(name_file, start=1):
            name = line.strip()
            print("Line{}: {}".format(count, name))
            name_list.append(name)
    return name_list
def generate_name_from_email(mail_address, pinyin_name_list):
    """Look up a name whose pinyin contains a fragment of ``mail_address``.

    Every contiguous fragment of ``mail_address`` (in the order produced by
    ``get_ordered_combins``) that is at least three characters long is tested
    against each entry of ``pinyin_name_list``. An entry is split on spaces;
    its first part is taken as the last name and its final part as the first
    name. The first entry in which the fragment occurs in either part wins.

    Args:
        mail_address: Text to take fragments from (the caller passes the part
            of an e-mail address before the ``@``).
        pinyin_name_list: Entries of the form ``"<last name> <first name>"``.

    Returns:
        A ``(last_name, first_name)`` tuple for the first match, or ``None``
        when no fragment matches any entry.
    """
    for fragment in get_ordered_combins(mail_address):
        word_to_test = "".join(fragment)
        if len(word_to_test) < 3:
            # Fragments shorter than three characters are never tested.
            continue
        for entry in pinyin_name_list:
            parts = entry.split(" ")
            last_name, first_name = parts[0], parts[-1]
            if word_to_test in last_name or word_to_test in first_name:
                return last_name, first_name
    # NOTE: a disabled draft post-processed the candidates through
    # get_better_list to pick non-duplicate names; it is not active.
    return None
def write_new_contacts_to_excel(valid_contacts: list):
    """Write the contacts to ``real_name_contacts_<count>.xlsx``.

    The sheet has a bold header row (name, phone, passport, email, note)
    followed by one row per contact. Before a contact is written, its
    ``phone`` and ``passport`` attributes are overwritten with the values
    returned by ``get_random_phone_numbers()`` and
    ``get_random_passport_id_number()``, so the passed-in objects are mutated.

    Args:
        valid_contacts: Contacts exposing ``last_name``, ``first_name``,
            ``mail`` and ``note``.
            NOTE(review): ``note`` is read here but not set anywhere in the
            visible code -- confirm that ContactPojo provides it.
    """
    titles = ['name', 'phone', 'passport', 'email', 'note']

    workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts)))
    worksheet = workbook.add_worksheet()
    bold = workbook.add_format({'bold': True})

    # Header occupies row 0.
    for col_num, title in enumerate(titles):
        worksheet.write(0, col_num, title, bold)

    # Data rows start directly below the header.
    for row_num, info in enumerate(valid_contacts, start=1):
        info.phone = get_random_phone_numbers()
        info.passport = get_random_passport_id_number()
        cells = (
            "{} {}".format(info.last_name, info.first_name),
            info.phone,
            info.passport,
            info.mail,
            info.note,
        )
        for col_num, value in enumerate(cells):
            worksheet.write(row_num, col_num, value)

    workbook.close()
if __name__ == '__main__':
    # Script entry point: take a fixed window of destination e-mails from
    # MongoDB, try to derive a name for each from the pinyin list, and export
    # the contacts that received both a last and a first name.
    db_manager = MongoDbManager()
    # Only the slice [1001:1200] of the e-mail list is processed per run.
    mail_list = db_manager.get_destination_emails()[1001:1200]
    pinyin_name_list = read_pinyin_list_from_file()
    generate_contacts = []

    for mail in mail_list:
        contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="")
        # The text before the first "@" is what gets matched against names.
        local_part = mail.mail.split("@")[0]
        possible_name = generate_name_from_email(local_part, pinyin_name_list)
        if possible_name is not None:
            contact.last_name, contact.first_name = possible_name[0], possible_name[1]
            # Keep only contacts for which both name parts are non-empty.
            if contact.last_name and contact.first_name:
                generate_contacts.append(contact)

    write_new_contacts_to_excel(generate_contacts)