diff --git a/src/person_name/__init__.py b/src/person_name/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/person_name/convert_name_to_pinyin.py b/src/person_name/convert_name_to_pinyin.py
new file mode 100644
index 0000000..cbea248
--- /dev/null
+++ b/src/person_name/convert_name_to_pinyin.py
@@ -0,0 +1,40 @@
+from pypinyin import pinyin, lazy_pinyin, Style
+
+
+def read_name_from_files_by_line():
+    """Convert each name in the corpus file to pinyin, one result per line.
+
+    Streams the hard-coded corpus file, converts every stripped line with
+    ``convert_name_to_pinyin`` and writes the result to ``pinyin_list.txt``
+    in the current working directory, echoing progress to stdout. Lines that
+    produce no pinyin (blank or BOM-only) are skipped.
+    """
+    # Context managers close both files even if a conversion raises.
+    with open('/Users/lpan/Downloads/Chinese_Names_Corpus.txt', 'r', encoding='utf-8') as corpus:
+        with open('pinyin_list.txt', 'w', encoding='utf-8') as output:
+            for count, line in enumerate(corpus, start=1):
+                name = line.strip()
+                print("Line{}: {}".format(count, name))
+                name_to_save = convert_name_to_pinyin(name)
+                if not name_to_save:
+                    continue
+                output.write(name_to_save + "\n")
+                print(name_to_save)
+
+
+def convert_name_to_pinyin(name: str):
+    """Return ``name`` in pinyin as "<first item> <remaining items joined>".
+
+    Byte-order-mark items returned by ``lazy_pinyin`` are dropped. When no
+    items remain (e.g. an empty ``name``) an empty string is returned.
+    """
+    items = [item for item in lazy_pinyin(name) if item != '\ufeff']
+    if not items:
+        # Nothing to index; indexing [0] here would raise IndexError.
+        return ""
+    return items[0] + " " + "".join(items[1:])
+
+
+if __name__ == '__main__':
+    # Guarded so that importing this module does not start a conversion run.
+    read_name_from_files_by_line()
diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py
new file mode 100644
index 0000000..abf1b20
--- /dev/null
+++ b/src/person_name/extract_name_with_pinyinlist.py
@@ -0,0 +1,130 @@
+import itertools
+
+import xlsxwriter
+from pyhanlp import *
+from Pinyin2Hanzi import DefaultHmmParams
+from Pinyin2Hanzi import viterbi
+from itertools import combinations
+
+from src.db.mongo_manager import MongoDbManager
+from src.pojo.contact_pojo import ContactPojo
+from src.utils.excel_reader import get_random_phone_numbers, generate_email_from_name
+from src.utils.generate_random_passport_id import get_random_passport_id_number
+
+
+def get_ordered_combins(stuff):
+    """Return every non-empty contiguous slice of ``stuff``.
+
+    Slices are ordered by start index, then by end index, e.g. ``"abc"``
+    gives ``['a', 'ab', 'abc', 'b', 'bc', 'c']``.
+    """
+    # Each (i, j) pair with i < j drawn from 0..len(stuff) bounds one slice.
+    slice_bounds = itertools.combinations(range(len(stuff) + 1), 2)
+    return [stuff[i:j] for i, j in slice_bounds]
+
+
+def get_better_list(list):
+    """Return ``list`` unchanged; no filtering is currently applied."""
+    # NOTE: the parameter shadows the builtin ``list``; the name is kept so
+    # that existing callers are unaffected.
+    return list
+
+
+# Keyword extraction
+
+def read_pinyin_list_from_file() -> list:
+    """Load the pinyin names stored one per line in ``clean_list.txt``.
+
+    Each line is stripped and echoed to stdout with its line number.
+
+    Returns:
+        The stripped lines, in file order.
+    """
+    name_list = []
+    # The context manager guarantees the file handle is closed.
+    with open('clean_list.txt', 'r', encoding='utf-8') as name_file:
+        for count, line in enumerate(name_file, start=1):
+            name = line.strip()
+            print("Line{}: {}".format(count, name))
+            name_list.append(name)
+    return name_list
+
+
+def generate_name_from_email(mail_address, pinyin_name_list):
+    """Guess a (last_name, first_name) pair from an e-mail local part.
+
+    Every contiguous slice of ``mail_address`` that is at least three
+    characters long is tried in turn; the first entry of
+    ``pinyin_name_list`` whose last or first name contains the slice wins.
+
+    Args:
+        mail_address: Text to search, e.g. the part of an address before "@".
+        pinyin_name_list: Names formatted as "<last> <first>".
+
+    Returns:
+        A ``(last_name, first_name)`` tuple, or ``None`` when nothing matches.
+    """
+    # Split every name once up front rather than twice per candidate slice.
+    name_parts = []
+    for name in pinyin_name_list:
+        pieces = name.split(" ")
+        name_parts.append((pieces[0], pieces[-1]))
+
+    for candidate in get_ordered_combins(mail_address):
+        word_to_test = "".join(candidate)
+        if len(word_to_test) < 3:
+            continue
+        for last_name, first_name in name_parts:
+            if word_to_test in last_name or word_to_test in first_name:
+                return last_name, first_name
+    return None
+
+
+def write_new_contacts_to_excel(valid_contacts: list):
+    """Write the contacts to ``real_name_contacts_<count>.xlsx``.
+
+    Each contact is assigned a freshly generated phone and passport number
+    (stored on the contact object itself) before its row is written. The
+    columns are: name, phone, passport, email, note.
+
+    Args:
+        valid_contacts: Contact objects providing ``last_name``,
+            ``first_name``, ``mail`` and ``note``.
+    """
+    header_data = ['name', 'phone', 'passport', 'email', 'note']
+    file_name = 'real_name_contacts_{}.xlsx'.format(len(valid_contacts))
+    # The context manager closes (and thereby saves) the workbook even when
+    # writing a row raises.
+    with xlsxwriter.Workbook(file_name) as workbook:
+        worksheet = workbook.add_worksheet()
+        header_format = workbook.add_format({'bold': True})
+        for col_num, data in enumerate(header_data):
+            worksheet.write(0, col_num, data, header_format)
+
+        # Data rows start directly below the header row.
+        for row, info in enumerate(valid_contacts, start=1):
+            info.phone = get_random_phone_numbers()
+            info.passport = get_random_passport_id_number()
+            worksheet.write(row, 0, "{} {}".format(info.last_name, info.first_name))
+            worksheet.write(row, 1, info.phone)
+            worksheet.write(row, 2, info.passport)
+            worksheet.write(row, 3, info.mail)
+            worksheet.write(row, 4, info.note)
+
+
+if __name__ == '__main__':
+    db_manager = MongoDbManager()
+    mail_list = db_manager.get_destination_emails()[1001:1200]
+    generate_contacts = []
+    pinyin_name_list = read_pinyin_list_from_file()
+
+    for mail in mail_list:
+        contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="")
+        local_part = mail.mail.split("@")[0]
+        possible_name = generate_name_from_email(local_part, pinyin_name_list)
+        if possible_name is not None:
+            contact.last_name, contact.first_name = possible_name
+        # Keep only contacts for which both name parts were found.
+        if contact.last_name and contact.first_name:
+            generate_contacts.append(contact)
+    write_new_contacts_to_excel(generate_contacts)