From 0e9221db249def1d1ed2b33221a038c0d01fbeae Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Sat, 13 Apr 2024 00:31:02 +0200 Subject: [PATCH 1/3] add created_at field for ip information --- src/utils/contacts/generate_ip_with_contact.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/utils/contacts/generate_ip_with_contact.py b/src/utils/contacts/generate_ip_with_contact.py index 6eb1a5a..8fd456d 100644 --- a/src/utils/contacts/generate_ip_with_contact.py +++ b/src/utils/contacts/generate_ip_with_contact.py @@ -18,6 +18,7 @@ def get_contact_list() -> list: _contact = ContactPojo(first_name=item.first_name, last_name=item.last_name, mail=item.mail, phone_number=item.phone, passport_number=item.passport, store=item.store) _contact.ip_address = item.ip_address + _contact.created_at = item.created_at _contact_list.append(_contact) return _contact_list @@ -39,7 +40,7 @@ def write_contact_with_ip_info_to_file(contact_list): col = 0 # Create a workbook and add a worksheet. workbook = xlsxwriter.Workbook('ip_info_{}.xlsx'.format(len(contact_list))) - header_data = ['name', 'email', 'isp', 'ip_address'] + header_data = ['name', 'email', 'isp', 'ip_address', 'created_at'] worksheet = workbook.add_worksheet() header_format = workbook.add_format({'bold': True}) @@ -52,6 +53,7 @@ def write_contact_with_ip_info_to_file(contact_list): worksheet.write(row, col + 1, info.mail) worksheet.write(row, col + 2, info.isp) worksheet.write(row, col + 3, info.ip_address) + worksheet.write(row, col + 4, info.created_at) row += 1 workbook.close() From c7713079a4c2090daadcb37bf7f92dc49114d2bd Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Sun, 5 May 2024 17:03:06 +0200 Subject: [PATCH 2/3] optimization with cython --- setup.py | 6 + src/person_name/cython_extract_methods.pyx | 192 ++++++++++++++++++ .../extract_name_with_pinyinlist.py | 30 +-- src/person_name/start_extraction.py | 22 ++ 4 files changed, 231 insertions(+), 19 deletions(-) create mode 100644 setup.py create mode 100755 src/person_name/cython_extract_methods.pyx create mode 100644 src/person_name/start_extraction.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..cd39ba0 --- /dev/null +++ b/setup.py @@ -0,0 +1,6 @@ +from setuptools import setup +from Cython.Build import cythonize + +setup( + ext_modules = cythonize("src/person_name/cython_extract_methods.pyx") +) \ No newline at end of file diff --git a/src/person_name/cython_extract_methods.pyx b/src/person_name/cython_extract_methods.pyx new file mode 100755 index 0000000..3f74252 --- /dev/null +++ b/src/person_name/cython_extract_methods.pyx @@ -0,0 +1,192 @@ +import itertools +import random +from concurrent.futures import ThreadPoolExecutor + +import xlsxwriter + +from src.db.mongo_manager import MongoDbManager, MONGO_STORE_MANAGER +from src.pojo.contact_pojo import ContactPojo +from src.pojo.mail.mail_pojo import MailAddress +from src.utils.excel_reader import get_random_fr_phone_numbers, ExcelHelper +from src.utils.contacts.generate_random_passport_id import get_random_passport_id_number + +def get_ordered_combines(stuff): + list_to_return: list = [] + cdef int it_len = 3 + cdef int it_number = 2 + for i, j in itertools.combinations(range(len(stuff) + it_len), it_number): + # print(stuff[i:j]) + list_to_return.append(stuff[i:j]) + return list_to_return + +# 关键词提取 + +def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list: + file2 = open(name_list_file_path, 'r') + lines = file2.readlines() + name_list = [] + cdef int count = 0 + for line in lines: + count += 1 + name_list.append(line.strip()) + return name_list + +def read_pinyin_first_name_from_file() -> list: + file2 = open('first_name_noDuplicates.txt', 'r') + lines = file2.readlines() + name_list = [] + cdef int count = 0 + for line in lines: + count += 1 + # print("Line{}: {}".format(count, line.strip())) + name_list.append(line.strip()) + return name_list + +def read_pinyin_last_name_from_file() -> list: + file2 = open('last_name_noDuplicates.txt', 'r') + lines = file2.readlines() + name_list = [] + cdef int count = 0 + for line in lines: + count += 1 + # print("Line{}: {}".format(count, line.strip())) + name_list.append(line.strip()) + return name_list + +def generate_new_list_from_old_name_list(): + all_last_name = read_pinyin_last_name_from_file() + all_first_name = read_pinyin_first_name_from_file() + # for name in all_name_list: + # last_name = name.split(" ")[0] + # first_name = name.split(" ")[-1] + # all_last_name.append(last_name) + # all_first_name.append(first_name) + f = open("all_new_name_list.txt", "w") + for last_name in all_last_name: + for first_name in all_first_name: + new_name = "{} {}\n".format(last_name, first_name) + f.write(str(new_name)) + f.close() + +def get_maximum_length(): + all_last_name = read_pinyin_last_name_from_file() + all_first_name = read_pinyin_first_name_from_file() + # for name in all_name_list: + # last_name = name.split(" ")[0] + # first_name = name.split(" ")[-1] + # all_last_name.append(last_name) + # all_first_name.append(first_name) + cdef int max_last_name_length = 0 + cdef int max_first_name_length = 0 + + for last_name in all_last_name: + if len(last_name) > max_last_name_length: + max_last_name_length = len(last_name) + for first_name in all_first_name: + if len(first_name) > max_first_name_length: + max_first_name_length = len(first_name) + # print("max_last_name_lenghth :" + str(max_last_name_lenghth)) + # print("max_first_name_lenghth :" + str(max_first_name_lenghth)) + +def has_numbers(inputString: str): + return any(char.isdigit() for char in inputString) + +def check_name(word_to_test: str, pinyin_name_list: list): + if "_" in word_to_test or "." in word_to_test or "v" in word_to_test: + return None + if has_numbers(word_to_test): + return None + for name in pinyin_name_list: + last_name = name.split(" ")[0] + first_name = name.split(" ")[-1] + full_name = last_name + first_name + full_name_inverse = first_name + last_name + if word_to_test.lower() in last_name.lower(): + return last_name, first_name + elif word_to_test.lower() in first_name.lower(): + return last_name, first_name + elif word_to_test.lower() in full_name.lower(): + return last_name, first_name + elif word_to_test.lower() in full_name_inverse.lower(): + return last_name, first_name + return None + +def generate_name_from_email(mail_address, pinyin_name_list): + all_combines = get_ordered_combines(mail_address) + all_combines.sort(key=len, reverse=True) + no_duplicated_list = [] + for word in all_combines: + if word not in no_duplicated_list: + no_duplicated_list.append(word) + cdef int min_length = 5, max_length = 18 + for i in no_duplicated_list: + word_to_test = "".join(i) + if min_length <= len(word_to_test) <= max_length: + found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list) + if found_name is not None: + return found_name + + # 选择不重复的 + # if len(pinyin_name_list) > 3: + # return get_better_list(pinyin_name_list) + # else: + return None + +def write_new_contacts_to_excel(valid_contacts: list): + cdef int row = 0 + cdef int col = 0 + # Create a workbook and add a worksheet. + workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts))) + header_data = ['name', 'phone', 'passport', 'email', 'store'] + worksheet = workbook.add_worksheet() + header_format = workbook.add_format({'bold': True}) + + for col_num, data in enumerate(header_data): + worksheet.write(row, col_num, data, header_format) + row = row + 1 + for info in valid_contacts: + info.phone = get_random_fr_phone_numbers() + info.passport = get_random_passport_id_number() + # Iterate over the data and write it out row by row. + worksheet.write(row, col, "{} {}".format(info.last_name, info.first_name)) + worksheet.write(row, col + 1, info.phone) + worksheet.write(row, col + 2, info.passport) + worksheet.write(row, col + 3, info.mail) + worksheet.write(row, col + 4, "random") + row += 1 + workbook.close() + +def find_contact(generate_contacts: list, mail, pinyin_name_list: list): + contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="", + store="random") + splitted = mail.mail.split("@") + possible_name_list = generate_name_from_email(splitted[0], pinyin_name_list) + if possible_name_list is not None: + contact.last_name = possible_name_list[0] + contact.first_name = possible_name_list[1] + cdef int min_len = 0 + if len(contact.last_name) > min_len and len(contact.first_name) > min_len: + generate_contacts.append(contact) + +def generate_contact_from_mail_list(_start_position: int, _end_position: int, + name_list_file_path="all_new_name_list.txt"): + db_manager = MongoDbManager() + mail_list = db_manager.get_destination_emails()[_start_position:_end_position] + print("mail_list size before filter is {}".format(len(mail_list))) + filter_already_validated_contacts(mail_list) + print("mail_list size after filter is {}".format(len(mail_list))) + generate_contacts = [] + pinyin_name_list = read_pinyin_list_from_file(name_list_file_path) + # todo, remove the comment below + # random.shuffle(pinyin_name_list) + with ThreadPoolExecutor(max_workers=len(mail_list)) as executor: + for mail in mail_list: + executor.submit(find_contact, generate_contacts, mail, pinyin_name_list) + write_new_contacts_to_excel(generate_contacts) + +def filter_already_validated_contacts(_list_to_extract): + _already_validated_contact_list = MONGO_STORE_MANAGER.get_all_contacts_to_book() + for _validated_contact in _already_validated_contact_list: + for _extracted_contact in _list_to_extract: + if _validated_contact.mail == _extracted_contact.mail: + _list_to_extract.remove(_extracted_contact) diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py index 570dd26..c281470 100755 --- a/src/person_name/extract_name_with_pinyinlist.py +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -17,19 +17,10 @@ def get_ordered_combins(stuff): list_to_return.append(stuff[i:j]) return list_to_return - -def get_better_list(list): - return list - # for name in list: - # if len(name) == 2: - # list.remove(name) - # return list - - # 关键词提取 -def read_pinyin_list_from_file() -> list: - file2 = open('all_new_name_list.txt', 'r') +def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list: + file2 = open(name_list_file_path, 'r') lines = file2.readlines() name_list = [] count = 0 @@ -210,8 +201,8 @@ def write_new_contacts_to_excel(valid_contacts: list): def find_contact(generate_contacts: list, mail, pinyin_name_list): contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="", store="random") - spliteed = mail.mail.split("@") - possible_name_list = generate_name_from_email(spliteed[0], pinyin_name_list) + splitted = mail.mail.split("@") + possible_name_list = generate_name_from_email(splitted[0], pinyin_name_list) if possible_name_list is not None: contact.last_name = possible_name_list[0] contact.first_name = possible_name_list[1] @@ -219,7 +210,7 @@ def find_contact(generate_contacts: list, mail, pinyin_name_list): generate_contacts.append(contact) -def generate_contact_from_mail_list(_start_position, _end_position): +def generate_contact_from_mail_list(_start_position, _end_position, name_list_file_path="all_new_name_list.txt"): db_manager = MongoDbManager() # mail_list = db_manager.get_destination_emails()[8501:8520] # mail_list = db_manager.get_destination_emails()[8521:8600] @@ -231,8 +222,9 @@ def generate_contact_from_mail_list(_start_position, _end_position): # excel_reader = ExcelHelper() # mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/toExtract.xlsx") generate_contacts = [] - pinyin_name_list = read_pinyin_list_from_file() - random.shuffle(pinyin_name_list) + pinyin_name_list = read_pinyin_list_from_file(name_list_file_path) + # todo, remove the comment below + # random.shuffle(pinyin_name_list) with ThreadPoolExecutor(max_workers=len(mail_list)) as executor: for mail in mail_list: executor.submit(find_contact, generate_contacts, mail, pinyin_name_list) @@ -251,9 +243,9 @@ def filter_already_validated_contacts(_list_to_extract): if __name__ == '__main__': start_position = 1 - end_position = 502 + end_position = 3 # x = range(start_position, end_position, 100) - step = 500 + step = 1 step_list = range(start_position, end_position, step) print(step_list[-1]) for x in step_list: @@ -263,7 +255,7 @@ if __name__ == '__main__': print("_from_position is {},_end_position is {}".format(_from_position, _end_position)) if _end_position <= step_list[-1]: print("start extraction from {} to {}".format(_from_position, _end_position)) - generate_contact_from_mail_list(_from_position, _end_position) + generate_contact_from_mail_list(_from_position, _end_position, name_list_file_path = "/Users/lpan/Documents/workspace/appointment_tool/src/person_name/all_new_name_list.txt") else: print("stop with {},{}".format(_from_position, _end_position)) # get_maximum_length() diff --git a/src/person_name/start_extraction.py b/src/person_name/start_extraction.py new file mode 100644 index 0000000..fc502f9 --- /dev/null +++ b/src/person_name/start_extraction.py @@ -0,0 +1,22 @@ +from cython_extract_methods import generate_contact_from_mail_list + +# from extract_name_with_pinyinlist import generate_contact_from_mail_list + +if __name__ == '__main__': + start_position = 1 + end_position = 4 + # x = range(start_position, end_position, 100) + step = 2 + step_list = range(start_position, end_position, step) + print(step_list[-1]) + for x in step_list: + print(x) + _from_position = x + _end_position = x + step + print("_from_position is {},_end_position is {}".format(_from_position, _end_position)) + if _end_position <= step_list[-1]: + print("start extraction from {} to {}".format(_from_position, _end_position)) + generate_contact_from_mail_list(_from_position, _end_position, + name_list_file_path="/Users/lpan/Documents/workspace/appointment_tool/src/person_name/all_new_name_list.txt") + else: + print("stop with {},{}".format(_from_position, _end_position)) From 2dd34eb43228e3930560cd9b545fadc43db2d28d Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Sun, 5 May 2024 21:23:18 +0200 Subject: [PATCH 3/3] optimization with cython --- src/person_name/cython_extract_methods.pyx | 71 +++++-------------- .../extract_name_with_pinyinlist.py | 15 ++-- src/person_name/start_extraction.py | 4 +- 3 files changed, 26 insertions(+), 64 deletions(-) diff --git a/src/person_name/cython_extract_methods.pyx b/src/person_name/cython_extract_methods.pyx index 3f74252..2489c12 100755 --- a/src/person_name/cython_extract_methods.pyx +++ b/src/person_name/cython_extract_methods.pyx @@ -1,17 +1,17 @@ import itertools import random from concurrent.futures import ThreadPoolExecutor +from typing import Type import xlsxwriter from src.db.mongo_manager import MongoDbManager, MONGO_STORE_MANAGER from src.pojo.contact_pojo import ContactPojo -from src.pojo.mail.mail_pojo import MailAddress -from src.utils.excel_reader import get_random_fr_phone_numbers, ExcelHelper from src.utils.contacts.generate_random_passport_id import get_random_passport_id_number +from src.utils.excel_reader import get_random_fr_phone_numbers def get_ordered_combines(stuff): - list_to_return: list = [] + cdef list list_to_return = [] cdef int it_len = 3 cdef int it_number = 2 for i, j in itertools.combinations(range(len(stuff) + it_len), it_number): @@ -24,69 +24,34 @@ def get_ordered_combines(stuff): def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list: file2 = open(name_list_file_path, 'r') lines = file2.readlines() - name_list = [] + cdef list name_list = [] cdef int count = 0 for line in lines: count += 1 name_list.append(line.strip()) - return name_list + return list(name_list) def read_pinyin_first_name_from_file() -> list: file2 = open('first_name_noDuplicates.txt', 'r') lines = file2.readlines() - name_list = [] + cdef list name_list = [] cdef int count = 0 for line in lines: count += 1 # print("Line{}: {}".format(count, line.strip())) name_list.append(line.strip()) - return name_list + return list(name_list) def read_pinyin_last_name_from_file() -> list: file2 = open('last_name_noDuplicates.txt', 'r') lines = file2.readlines() - name_list = [] + cdef list name_list = [] cdef int count = 0 for line in lines: count += 1 # print("Line{}: {}".format(count, line.strip())) name_list.append(line.strip()) - return name_list - -def generate_new_list_from_old_name_list(): - all_last_name = read_pinyin_last_name_from_file() - all_first_name = read_pinyin_first_name_from_file() - # for name in all_name_list: - # last_name = name.split(" ")[0] - # first_name = name.split(" ")[-1] - # all_last_name.append(last_name) - # all_first_name.append(first_name) - f = open("all_new_name_list.txt", "w") - for last_name in all_last_name: - for first_name in all_first_name: - new_name = "{} {}\n".format(last_name, first_name) - f.write(str(new_name)) - f.close() - -def get_maximum_length(): - all_last_name = read_pinyin_last_name_from_file() - all_first_name = read_pinyin_first_name_from_file() - # for name in all_name_list: - # last_name = name.split(" ")[0] - # first_name = name.split(" ")[-1] - # all_last_name.append(last_name) - # all_first_name.append(first_name) - cdef int max_last_name_length = 0 - cdef int max_first_name_length = 0 - - for last_name in all_last_name: - if len(last_name) > max_last_name_length: - max_last_name_length = len(last_name) - for first_name in all_first_name: - if len(first_name) > max_first_name_length: - max_first_name_length = len(first_name) - # print("max_last_name_lenghth :" + str(max_last_name_lenghth)) - # print("max_first_name_lenghth :" + str(max_first_name_lenghth)) + return list(name_list) def has_numbers(inputString: str): return any(char.isdigit() for char in inputString) @@ -114,12 +79,13 @@ def check_name(word_to_test: str, pinyin_name_list: list): def generate_name_from_email(mail_address, pinyin_name_list): all_combines = get_ordered_combines(mail_address) all_combines.sort(key=len, reverse=True) - no_duplicated_list = [] - for word in all_combines: - if word not in no_duplicated_list: + cdef list no_duplicated_list = [] + cdef int min_length = 4, max_length = 18 + for word in list(all_combines): + if word not in list(no_duplicated_list) and len(word) >= min_length: + print("add word {}".format(word)) no_duplicated_list.append(word) - cdef int min_length = 5, max_length = 18 - for i in no_duplicated_list: + for i in list(no_duplicated_list): word_to_test = "".join(i) if min_length <= len(word_to_test) <= max_length: found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list) @@ -132,7 +98,7 @@ def generate_name_from_email(mail_address, pinyin_name_list): # else: return None -def write_new_contacts_to_excel(valid_contacts: list): +def write_new_contacts_to_excel(valid_contacts: Type[list]): cdef int row = 0 cdef int col = 0 # Create a workbook and add a worksheet. @@ -156,7 +122,7 @@ def write_new_contacts_to_excel(valid_contacts: list): row += 1 workbook.close() -def find_contact(generate_contacts: list, mail, pinyin_name_list: list): +def find_contact(generate_contacts: Type[list], mail, pinyin_name_list: list): contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="", store="random") splitted = mail.mail.split("@") @@ -175,9 +141,8 @@ def generate_contact_from_mail_list(_start_position: int, _end_position: int, print("mail_list size before filter is {}".format(len(mail_list))) filter_already_validated_contacts(mail_list) print("mail_list size after filter is {}".format(len(mail_list))) - generate_contacts = [] + cdef list generate_contacts = [] pinyin_name_list = read_pinyin_list_from_file(name_list_file_path) - # todo, remove the comment below # random.shuffle(pinyin_name_list) with ThreadPoolExecutor(max_workers=len(mail_list)) as executor: for mail in mail_list: diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py index c281470..5408d9a 100755 --- a/src/person_name/extract_name_with_pinyinlist.py +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -17,6 +17,7 @@ def get_ordered_combins(stuff): list_to_return.append(stuff[i:j]) return list_to_return + # 关键词提取 def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list: @@ -210,21 +211,16 @@ def find_contact(generate_contacts: list, mail, pinyin_name_list): generate_contacts.append(contact) -def generate_contact_from_mail_list(_start_position, _end_position, name_list_file_path="all_new_name_list.txt"): +def generate_contact_from_mail_list(_start_position: int, _end_position: int, + name_list_file_path="all_new_name_list.txt"): db_manager = MongoDbManager() - # mail_list = db_manager.get_destination_emails()[8501:8520] - # mail_list = db_manager.get_destination_emails()[8521:8600] mail_list = db_manager.get_destination_emails()[_start_position:_end_position] print("mail_list size before filter is {}".format(len(mail_list))) filter_already_validated_contacts(mail_list) print("mail_list size after filter is {}".format(len(mail_list))) - # mail_list = db_manager.get_destination_emails()[9323:9914] - # excel_reader = ExcelHelper() - # mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/toExtract.xlsx") generate_contacts = [] pinyin_name_list = read_pinyin_list_from_file(name_list_file_path) - # todo, remove the comment below - # random.shuffle(pinyin_name_list) + random.shuffle(pinyin_name_list) with ThreadPoolExecutor(max_workers=len(mail_list)) as executor: for mail in mail_list: executor.submit(find_contact, generate_contacts, mail, pinyin_name_list) @@ -255,7 +251,8 @@ if __name__ == '__main__': print("_from_position is {},_end_position is {}".format(_from_position, _end_position)) if _end_position <= step_list[-1]: print("start extraction from {} to {}".format(_from_position, _end_position)) - generate_contact_from_mail_list(_from_position, _end_position, name_list_file_path = "/Users/lpan/Documents/workspace/appointment_tool/src/person_name/all_new_name_list.txt") + generate_contact_from_mail_list(_from_position, _end_position, + name_list_file_path="/Users/lpan/Documents/workspace/appointment_tool/src/person_name/all_new_name_list.txt") else: print("stop with {},{}".format(_from_position, _end_position)) # get_maximum_length() diff --git a/src/person_name/start_extraction.py b/src/person_name/start_extraction.py index fc502f9..8a6b9db 100644 --- a/src/person_name/start_extraction.py +++ b/src/person_name/start_extraction.py @@ -4,9 +4,9 @@ from cython_extract_methods import generate_contact_from_mail_list if __name__ == '__main__': start_position = 1 - end_position = 4 + end_position = 202 # x = range(start_position, end_position, 100) - step = 2 + step = 200 step_list = range(start_position, end_position, step) print(step_list[-1]) for x in step_list: