optimization with cython

2024-05-05 17:03:06 +02:00
parent 609fa6c4e7
commit c7713079a4
4 changed files with 231 additions and 19 deletions
@@ -0,0 +1,6 @@
 from setuptools import setup
 from Cython.Build import cythonize
 # Compile the Cython module into a C extension at build time.
 extensions = cythonize("src/person_name/cython_extract_methods.pyx")
 setup(ext_modules=extensions)
@@ -0,0 +1,192 @@
 import itertools
 import random
 from concurrent.futures import ThreadPoolExecutor
 import xlsxwriter
 from src.db.mongo_manager import MongoDbManager, MONGO_STORE_MANAGER
 from src.pojo.contact_pojo import ContactPojo
 from src.pojo.mail.mail_pojo import MailAddress
 from src.utils.excel_reader import get_random_fr_phone_numbers, ExcelHelper
 from src.utils.contacts.generate_random_passport_id import get_random_passport_id_number
 def get_ordered_combines(stuff):
    """Return every non-empty contiguous slice of ``stuff``.

    Slices are ordered by start index, then by end index; for example
    ``"abc"`` yields ``["a", "ab", "abc", "b", "bc", "c"]``.

    Args:
        stuff: Any sliceable sequence (a string in this module).

    Returns:
        list: ``stuff[i:j]`` for all ``0 <= i < j <= len(stuff)``.
    """
    # Slice bounds stop at len(stuff): a bound past the end would only
    # repeat stuff[i:] or produce empty slices.
    bounds = range(len(stuff) + 1)
    return [stuff[i:j] for i, j in itertools.combinations(bounds, 2)]
 # Keyword extraction
 def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list:
    """Read a name list file, one entry per line.

    Args:
        name_list_file_path: Path of the text file to read.

    Returns:
        list: Every line of the file with surrounding whitespace stripped,
        in file order (blank lines become empty strings).
    """
    # The context manager closes the handle even if reading fails.
    with open(name_list_file_path, 'r') as name_file:
        return [line.strip() for line in name_file]
 def read_pinyin_first_name_from_file() -> list:
    """Read ``first_name_noDuplicates.txt`` from the current working directory.

    Returns:
        list: Every line of the file with surrounding whitespace stripped,
        in file order.
    """
    # The context manager closes the handle even if reading fails.
    with open('first_name_noDuplicates.txt', 'r') as name_file:
        return [line.strip() for line in name_file]
 def read_pinyin_last_name_from_file() -> list:
    """Read ``last_name_noDuplicates.txt`` from the current working directory.

    Returns:
        list: Every line of the file with surrounding whitespace stripped,
        in file order.
    """
    # The context manager closes the handle even if reading fails.
    with open('last_name_noDuplicates.txt', 'r') as name_file:
        return [line.strip() for line in name_file]
 def generate_new_list_from_old_name_list():
    """Write every last-name/first-name pairing to ``all_new_name_list.txt``.

    Last names and first names are loaded through
    ``read_pinyin_last_name_from_file`` and
    ``read_pinyin_first_name_from_file``. The output file is overwritten and
    receives one ``"<last name> <first name>"`` line per pairing, iterating
    last names in the outer loop and first names in the inner loop.
    """
    all_last_name = read_pinyin_last_name_from_file()
    all_first_name = read_pinyin_first_name_from_file()
    # The context manager closes the output file even if a write fails.
    with open("all_new_name_list.txt", "w") as name_file:
        name_file.writelines(
            "{} {}\n".format(last_name, first_name)
            for last_name in all_last_name
            for first_name in all_first_name
        )
 def get_maximum_length():
    """Find the longest entry length in the last-name and first-name lists.

    The lists are loaded through ``read_pinyin_last_name_from_file`` and
    ``read_pinyin_first_name_from_file``.

    Returns:
        tuple: ``(max_last_name_length, max_first_name_length)``; a length
        is 0 when the corresponding list is empty.
    """
    all_last_name = read_pinyin_last_name_from_file()
    all_first_name = read_pinyin_first_name_from_file()
    max_last_name_length = max(map(len, all_last_name), default=0)
    max_first_name_length = max(map(len, all_first_name), default=0)
    return max_last_name_length, max_first_name_length
 def has_numbers(inputString: str):
    """Tell whether ``inputString`` contains at least one digit character.

    Returns:
        bool: ``True`` as soon as a digit is found, ``False`` otherwise
        (including for an empty string).
    """
    for character in inputString:
        if character.isdigit():
            return True
    return False
 def check_name(word_to_test: str, pinyin_name_list: list):
    """Look up the first name-list entry that contains ``word_to_test``.

    Each entry is split on spaces; its first token is taken as the last name
    and its final token as the first name. The comparison is
    case-insensitive and succeeds when the word is a substring of the last
    name, the first name, or either concatenation of the two.

    Args:
        word_to_test: Candidate fragment to look for.
        pinyin_name_list: Entries such as ``"wang wei"``.

    Returns:
        tuple or None: ``(last_name, first_name)`` of the first matching
        entry, or ``None`` when the word contains ``"_"``, ``"."``, ``"v"``
        or a digit, or when no entry matches.
    """
    # Candidates holding any of these characters are rejected outright.
    if any(char in "_.v" or char.isdigit() for char in word_to_test):
        return None
    # Lower-case the candidate once instead of once per comparison.
    needle = word_to_test.lower()
    for name in pinyin_name_list:
        tokens = name.split(" ")
        last_name, first_name = tokens[0], tokens[-1]
        haystacks = (
            last_name.lower(),
            first_name.lower(),
            (last_name + first_name).lower(),
            (first_name + last_name).lower(),
        )
        if any(needle in haystack for haystack in haystacks):
            return last_name, first_name
    return None
 def generate_name_from_email(mail_address, pinyin_name_list):
    """Guess a (last name, first name) pair from part of an e-mail address.

    Every contiguous fragment of ``mail_address`` produced by
    ``get_ordered_combines`` is tried, longest first, against
    ``pinyin_name_list`` through ``check_name``. Only fragments between 5 and
    18 characters long are tested, and the first hit wins.

    Args:
        mail_address: Text to mine for a name (callers in this module pass
            the part of the address before ``"@"``).
        pinyin_name_list: Name entries forwarded to ``check_name``.

    Returns:
        tuple or None: The ``(last_name, first_name)`` pair reported by
        ``check_name``, or ``None`` when no fragment matches.
    """
    min_length, max_length = 5, 18
    all_combines = get_ordered_combines(mail_address)
    all_combines.sort(key=len, reverse=True)
    # dict.fromkeys drops repeated fragments in a single pass while keeping
    # the first-seen (longest-first) order.
    candidates = dict.fromkeys("".join(fragment) for fragment in all_combines)
    for word_to_test in candidates:
        if min_length <= len(word_to_test) <= max_length:
            found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
            if found_name is not None:
                return found_name
    return None
 def write_new_contacts_to_excel(valid_contacts: list):
    """Export contacts to ``real_name_contacts_<count>.xlsx``.

    The sheet starts with a bold header row (name, phone, passport, email,
    store) followed by one row per contact. Each contact is given a freshly
    generated phone number and passport number before being written, and
    the store column is always ``"random"``.

    Args:
        valid_contacts: Contacts exposing ``last_name``, ``first_name`` and
            ``mail``; their ``phone`` and ``passport`` attributes are
            overwritten.
    """
    workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts)))
    header_data = ['name', 'phone', 'passport', 'email', 'store']
    worksheet = workbook.add_worksheet()
    header_format = workbook.add_format({'bold': True})
    # Row 0 carries the header; data rows start at 1.
    for col_num, data in enumerate(header_data):
        worksheet.write(0, col_num, data, header_format)
    for row, info in enumerate(valid_contacts, start=1):
        info.phone = get_random_fr_phone_numbers()
        info.passport = get_random_passport_id_number()
        cells = (
            "{} {}".format(info.last_name, info.first_name),
            info.phone,
            info.passport,
            info.mail,
            "random",
        )
        for col, value in enumerate(cells):
            worksheet.write(row, col, value)
    workbook.close()
 def find_contact(generate_contacts: list, mail, pinyin_name_list: list):
    """Build a contact for ``mail`` and keep it when a name could be guessed.

    The text before ``"@"`` in ``mail.mail`` is passed to
    ``generate_name_from_email``. The contact is appended to
    ``generate_contacts`` only when both its last name and first name end up
    non-empty.

    Args:
        generate_contacts: Output list, mutated in place.
        mail: Object whose ``mail`` attribute holds the address string.
        pinyin_name_list: Name entries used for the lookup.
    """
    contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="",
                          store="random")
    local_part = mail.mail.split("@")[0]
    name_pair = generate_name_from_email(local_part, pinyin_name_list)
    if name_pair is not None:
        contact.last_name = name_pair[0]
        contact.first_name = name_pair[1]
    names_found = len(contact.last_name) > 0 and len(contact.first_name) > 0
    if names_found:
        generate_contacts.append(contact)
 def generate_contact_from_mail_list(_start_position: int, _end_position: int,
                                     name_list_file_path="all_new_name_list.txt"):
    """Guess names for a slice of destination e-mails and export them to Excel.

    The slice ``[_start_position:_end_position]`` of
    ``MongoDbManager().get_destination_emails()`` is first filtered with
    ``filter_already_validated_contacts``. Each remaining mail is handed to
    ``find_contact`` on a thread pool, and the collected contacts are written
    with ``write_new_contacts_to_excel``.

    Args:
        _start_position: Start index of the slice (inclusive).
        _end_position: End index of the slice (exclusive).
        name_list_file_path: Path of the name list file to match against.
    """
    db_manager = MongoDbManager()
    mail_list = db_manager.get_destination_emails()[_start_position:_end_position]
    print("mail_list size before filter is {}".format(len(mail_list)))
    filter_already_validated_contacts(mail_list)
    print("mail_list size after filter is {}".format(len(mail_list)))
    generate_contacts = []
    pinyin_name_list = read_pinyin_list_from_file(name_list_file_path)
    # TODO: re-enable the shuffle below.
    # random.shuffle(pinyin_name_list)
    # ThreadPoolExecutor raises ValueError for max_workers=0, so the pool is
    # skipped when no mail is left to process.
    if len(mail_list) > 0:
        with ThreadPoolExecutor(max_workers=len(mail_list)) as executor:
            for mail in mail_list:
                executor.submit(find_contact, generate_contacts, mail, pinyin_name_list)
    write_new_contacts_to_excel(generate_contacts)
 def filter_already_validated_contacts(_list_to_extract):
    """Drop, in place, every entry whose mail belongs to a validated contact.

    Validated contacts come from
    ``MONGO_STORE_MANAGER.get_all_contacts_to_book()``; entries are matched
    on their ``mail`` attribute.

    Args:
        _list_to_extract: List of objects with a ``mail`` attribute; it is
            mutated in place so the caller's reference sees the result.
    """
    # NOTE(review): assumes ``mail`` values are hashable (strings) -- confirm.
    validated_mails = {
        _validated_contact.mail
        for _validated_contact in MONGO_STORE_MANAGER.get_all_contacts_to_book()
    }
    # Rebuild through slice assignment instead of calling remove() while
    # iterating, which makes the iterator skip the element after each removal.
    _list_to_extract[:] = [
        _extracted_contact
        for _extracted_contact in _list_to_extract
        if _extracted_contact.mail not in validated_mails
    ]
@@ -17,19 +17,10 @@ def get_ordered_combins(stuff):
        list_to_return.append(stuff[i:j])
    return list_to_return
 def get_better_list(list):
    return list
    # for name in list:
    #     if len(name) == 2:
    #         list.remove(name)
    #         return list
 # Keyword extraction
-def read_pinyin_list_from_file() -> list:
+def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list:
-    file2 = open('all_new_name_list.txt', 'r')
+    file2 = open(name_list_file_path, 'r')
    lines = file2.readlines()
    name_list = []
    count = 0
@@ -210,8 +201,8 @@ def write_new_contacts_to_excel(valid_contacts: list):
 def find_contact(generate_contacts: list, mail, pinyin_name_list):
    contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="",
                          store="random")
-    spliteed = mail.mail.split("@")
+    splitted = mail.mail.split("@")
-    possible_name_list = generate_name_from_email(spliteed[0], pinyin_name_list)
+    possible_name_list = generate_name_from_email(splitted[0], pinyin_name_list)
    if possible_name_list is not None:
        contact.last_name = possible_name_list[0]
        contact.first_name = possible_name_list[1]
@@ -219,7 +210,7 @@ def find_contact(generate_contacts: list, mail, pinyin_name_list):
        generate_contacts.append(contact)
-def generate_contact_from_mail_list(_start_position, _end_position):
+def generate_contact_from_mail_list(_start_position, _end_position, name_list_file_path="all_new_name_list.txt"):
    db_manager = MongoDbManager()
    # mail_list = db_manager.get_destination_emails()[8501:8520]
    # mail_list = db_manager.get_destination_emails()[8521:8600]
@@ -231,8 +222,9 @@ def generate_contact_from_mail_list(_start_position, _end_position):
    # excel_reader = ExcelHelper()
    # mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/toExtract.xlsx")
    generate_contacts = []
-    pinyin_name_list = read_pinyin_list_from_file()
+    pinyin_name_list = read_pinyin_list_from_file(name_list_file_path)
-    random.shuffle(pinyin_name_list)
+    # todo, remove the comment below
    # random.shuffle(pinyin_name_list)
    with ThreadPoolExecutor(max_workers=len(mail_list)) as executor:
        for mail in mail_list:
            executor.submit(find_contact, generate_contacts, mail, pinyin_name_list)
@@ -251,9 +243,9 @@ def filter_already_validated_contacts(_list_to_extract):
 if __name__ == '__main__':
    start_position = 1
-    end_position = 502
+    end_position = 3
    # x = range(start_position, end_position, 100)
-    step = 500
+    step = 1
    step_list = range(start_position, end_position, step)
    print(step_list[-1])
    for x in step_list:
@@ -263,7 +255,7 @@ if __name__ == '__main__':
        print("_from_position is {},_end_position is {}".format(_from_position, _end_position))
        if _end_position <= step_list[-1]:
            print("start extraction from {} to {}".format(_from_position, _end_position))
-            generate_contact_from_mail_list(_from_position, _end_position)
+            generate_contact_from_mail_list(_from_position, _end_position, name_list_file_path = "/Users/lpan/Documents/workspace/appointment_tool/src/person_name/all_new_name_list.txt")
        else:
            print("stop with {},{}".format(_from_position, _end_position))
    # get_maximum_length()
@@ -0,0 +1,22 @@
 from cython_extract_methods import generate_contact_from_mail_list
 # from extract_name_with_pinyinlist import generate_contact_from_mail_list
 if __name__ == '__main__':
    # Name list handed to every extraction run.
    name_list_path = "/Users/lpan/Documents/workspace/appointment_tool/src/person_name/all_new_name_list.txt"
    start_position = 1
    end_position = 4
    step = 2
    step_list = range(start_position, end_position, step)
    # A window is processed only when it ends at or before the last start
    # position of the range.
    last_start = step_list[-1]
    print(last_start)
    for x in step_list:
        print(x)
        _from_position, _end_position = x, x + step
        print("_from_position is {},_end_position is {}".format(_from_position, _end_position))
        if _end_position > last_start:
            print("stop with {},{}".format(_from_position, _end_position))
            continue
        print("start extraction from {} to {}".format(_from_position, _end_position))
        generate_contact_from_mail_list(_from_position, _end_position,
                                        name_list_file_path=name_list_path)