From dee9c0a8ce573e0b07f60e98a3e9934bed6cd807 Mon Sep 17 00:00:00 2001 From: Lei PAN Date: Sun, 7 Jan 2024 21:13:13 +0100 Subject: [PATCH] performance optimization --- .../extract_name_with_pinyinlist.py | 55 +++++++++++++++---- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py index 37834a2..4dc9785 100755 --- a/src/person_name/extract_name_with_pinyinlist.py +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -6,7 +6,6 @@ import xlsxwriter from src.db.mongo_manager import MongoDbManager from src.pojo.contact_pojo import ContactPojo -from src.utils import excel_reader from src.utils.excel_reader import get_random_fr_phone_numbers, ExcelHelper from src.utils.contacts.generate_random_passport_id import get_random_passport_id_number @@ -81,6 +80,27 @@ def generate_new_list_from_old_name_list(): f.close() +def get_maximum_length(): + all_last_name = read_pinyin_last_name_from_file() + all_first_name = read_pinyin_first_name_from_file() + # for name in all_name_list: + # last_name = name.split(" ")[0] + # first_name = name.split(" ")[-1] + # all_last_name.append(last_name) + # all_first_name.append(first_name) + max_last_name_lenghth = 0 + max_first_name_lenghth = 0 + + for last_name in all_last_name: + if len(last_name) > max_last_name_lenghth: + max_last_name_lenghth = len(last_name) + for first_name in all_first_name: + if len(first_name) > max_first_name_lenghth: + max_first_name_lenghth = len(first_name) + print("max_last_name_lenghth :" + str(max_last_name_lenghth)) + print("max_first_name_lenghth :" + str(max_first_name_lenghth)) + + def has_numbers(inputString): return any(char.isdigit() for char in inputString) @@ -150,7 +170,7 @@ def generate_name_from_email(mail_address, pinyin_name_list): # found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list) # if found_name is not None: # return found_name - if len(word_to_test) >= 5: + if 5 <= len(word_to_test) <= 18: found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list) if found_name is not None: return found_name @@ -167,7 +187,7 @@ def write_new_contacts_to_excel(valid_contacts: list): col = 0 # Create a workbook and add a worksheet. workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts))) - header_data = ['name', 'phone', 'passport', 'email', 'note'] + header_data = ['name', 'phone', 'passport', 'email', 'store'] worksheet = workbook.add_worksheet() header_format = workbook.add_format({'bold': True}) @@ -182,7 +202,7 @@ def write_new_contacts_to_excel(valid_contacts: list): worksheet.write(row, col + 1, info.phone) worksheet.write(row, col + 2, info.passport) worksheet.write(row, col + 3, info.mail) - worksheet.write(row, col + 4, info.note) + worksheet.write(row, col + 4, "random") row += 1 workbook.close() @@ -198,13 +218,11 @@ def find_contact(generate_contacts: list, mail, pinyin_name_list): generate_contacts.append(contact) -def generate_contact_from_mail_list(): +def generate_contact_from_mail_list(_start_position, _end_position): db_manager = MongoDbManager() - # mail_list = db_manager.get_destination_emails()[1:500] - # mail_list = db_manager.get_destination_emails()[701:900] - # mail_list = db_manager.get_destination_emails()[901:1100] - # mail_list = db_manager.get_destination_emails()[3201:3400] - mail_list = db_manager.get_destination_emails()[3401:3600] + # mail_list = db_manager.get_destination_emails()[8501:8520] + # mail_list = db_manager.get_destination_emails()[8521:8600] + mail_list = db_manager.get_destination_emails()[_start_position:_end_position] # mail_list = db_manager.get_destination_emails()[9323:9914] # excel_reader = ExcelHelper() # mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/toExtract.xlsx") @@ -220,5 +238,20 @@ def generate_contact_from_mail_list(): if __name__ == '__main__': - generate_contact_from_mail_list() + start_position = 8701 + end_position = 8802 + # x = range(start_position, end_position, 100) + step = 100 + step_list = range(start_position, end_position, step) + print(step_list[-1]) + for x in step_list: + print(x) + _from_position = x + _end_position = x + step + if _end_position <= step_list[-1]: + print("start extraction from {} to {}".format(_from_position, _end_position)) + generate_contact_from_mail_list(_from_position, _end_position) + else: + print("stop with {},{}".format(_from_position, _end_position)) + # get_maximum_length() # generate_new_list_from_old_name_list()