optimization with cython

This commit is contained in:
2024-05-05 21:23:18 +02:00
parent c7713079a4
commit 2dd34eb432
3 changed files with 26 additions and 64 deletions
+18 -53
View File
@@ -1,17 +1,17 @@
import itertools import itertools
import random import random
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from typing import Type
import xlsxwriter import xlsxwriter
from src.db.mongo_manager import MongoDbManager, MONGO_STORE_MANAGER from src.db.mongo_manager import MongoDbManager, MONGO_STORE_MANAGER
from src.pojo.contact_pojo import ContactPojo from src.pojo.contact_pojo import ContactPojo
from src.pojo.mail.mail_pojo import MailAddress
from src.utils.excel_reader import get_random_fr_phone_numbers, ExcelHelper
from src.utils.contacts.generate_random_passport_id import get_random_passport_id_number from src.utils.contacts.generate_random_passport_id import get_random_passport_id_number
from src.utils.excel_reader import get_random_fr_phone_numbers
def get_ordered_combines(stuff): def get_ordered_combines(stuff):
list_to_return: list = [] cdef list list_to_return = []
cdef int it_len = 3 cdef int it_len = 3
cdef int it_number = 2 cdef int it_number = 2
for i, j in itertools.combinations(range(len(stuff) + it_len), it_number): for i, j in itertools.combinations(range(len(stuff) + it_len), it_number):
@@ -24,69 +24,34 @@ def get_ordered_combines(stuff):
def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list:
    """Read a name-list text file and return its lines, stripped.

    Args:
        name_list_file_path: Path of the text file to read; one entry per line.

    Returns:
        A new list with every line of the file after ``str.strip()``, in file
        order. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Use a context manager so the handle is always closed, even when reading
    # fails part-way through.
    with open(name_list_file_path, 'r') as name_file:
        return [line.strip() for line in name_file]
def read_pinyin_first_name_from_file(name_list_file_path='first_name_noDuplicates.txt') -> list:
    """Read the first-name file and return its lines, stripped.

    Args:
        name_list_file_path: Path of the text file to read; one entry per
            line. Defaults to ``first_name_noDuplicates.txt`` resolved against
            the current working directory, so existing zero-argument calls
            behave as before.

    Returns:
        A new list with every line of the file after ``str.strip()``, in file
        order. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Use a context manager so the handle is always closed, even when reading
    # fails part-way through.
    with open(name_list_file_path, 'r') as name_file:
        return [line.strip() for line in name_file]
def read_pinyin_last_name_from_file(name_list_file_path='last_name_noDuplicates.txt') -> list:
    """Read the last-name file and return its lines, stripped.

    Args:
        name_list_file_path: Path of the text file to read; one entry per
            line. Defaults to ``last_name_noDuplicates.txt`` resolved against
            the current working directory, so existing zero-argument calls
            behave as before.

    Returns:
        A new list with every line of the file after ``str.strip()``, in file
        order. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Use a context manager so the handle is always closed, even when reading
    # fails part-way through.
    with open(name_list_file_path, 'r') as name_file:
        return [line.strip() for line in name_file]
def generate_new_list_from_old_name_list():
    """Write every "<last name> <first name>" pairing to ``all_new_name_list.txt``.

    Last names come from ``read_pinyin_last_name_from_file()`` and first names
    from ``read_pinyin_first_name_from_file()``. The output file is opened in
    write mode (any existing content is replaced) and receives one pairing per
    line, iterating last names in the outer loop and first names in the inner
    loop.

    Raises:
        OSError: If an input file cannot be read or the output file cannot be
            written.
    """
    all_last_name = read_pinyin_last_name_from_file()
    all_first_name = read_pinyin_first_name_from_file()
    # The context manager guarantees the output file is flushed and closed
    # even if a write fails half-way through the cross product.
    with open("all_new_name_list.txt", "w") as output_file:
        output_file.writelines(
            "{} {}\n".format(last_name, first_name)
            for last_name in all_last_name
            for first_name in all_first_name
        )
def get_maximum_length():
    """Return the longest last-name and first-name lengths.

    Last names are read with ``read_pinyin_last_name_from_file()`` and first
    names with ``read_pinyin_first_name_from_file()``.

    Returns:
        A ``(max_last_name_length, max_first_name_length)`` tuple of ints. A
        component is 0 when the corresponding list is empty.
    """
    all_last_name = read_pinyin_last_name_from_file()
    all_first_name = read_pinyin_first_name_from_file()
    # default=0 mirrors the zero-initialised running maxima of the manual
    # loops and keeps empty lists from raising ValueError.
    max_last_name_length = max(map(len, all_last_name), default=0)
    max_first_name_length = max(map(len, all_first_name), default=0)
    # Hand the results back to the caller instead of discarding them.
    return max_last_name_length, max_first_name_length
def has_numbers(inputString: str):
    """Tell whether ``inputString`` contains at least one digit character.

    Returns ``True`` as soon as a character for which ``str.isdigit()`` holds
    is found, otherwise ``False`` (including for the empty string).
    """
    for character in inputString:
        if character.isdigit():
            return True
    return False
@@ -114,12 +79,13 @@ def check_name(word_to_test: str, pinyin_name_list: list):
def generate_name_from_email(mail_address, pinyin_name_list): def generate_name_from_email(mail_address, pinyin_name_list):
all_combines = get_ordered_combines(mail_address) all_combines = get_ordered_combines(mail_address)
all_combines.sort(key=len, reverse=True) all_combines.sort(key=len, reverse=True)
no_duplicated_list = [] cdef list no_duplicated_list = []
for word in all_combines: cdef int min_length = 4, max_length = 18
if word not in no_duplicated_list: for word in list(all_combines):
if word not in list(no_duplicated_list) and len(word) >= min_length:
print("add word {}".format(word))
no_duplicated_list.append(word) no_duplicated_list.append(word)
cdef int min_length = 5, max_length = 18 for i in list(no_duplicated_list):
for i in no_duplicated_list:
word_to_test = "".join(i) word_to_test = "".join(i)
if min_length <= len(word_to_test) <= max_length: if min_length <= len(word_to_test) <= max_length:
found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list) found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
@@ -132,7 +98,7 @@ def generate_name_from_email(mail_address, pinyin_name_list):
# else: # else:
return None return None
def write_new_contacts_to_excel(valid_contacts: list): def write_new_contacts_to_excel(valid_contacts: Type[list]):
cdef int row = 0 cdef int row = 0
cdef int col = 0 cdef int col = 0
# Create a workbook and add a worksheet. # Create a workbook and add a worksheet.
@@ -156,7 +122,7 @@ def write_new_contacts_to_excel(valid_contacts: list):
row += 1 row += 1
workbook.close() workbook.close()
def find_contact(generate_contacts: list, mail, pinyin_name_list: list): def find_contact(generate_contacts: Type[list], mail, pinyin_name_list: list):
contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="", contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="",
store="random") store="random")
splitted = mail.mail.split("@") splitted = mail.mail.split("@")
@@ -175,9 +141,8 @@ def generate_contact_from_mail_list(_start_position: int, _end_position: int,
print("mail_list size before filter is {}".format(len(mail_list))) print("mail_list size before filter is {}".format(len(mail_list)))
filter_already_validated_contacts(mail_list) filter_already_validated_contacts(mail_list)
print("mail_list size after filter is {}".format(len(mail_list))) print("mail_list size after filter is {}".format(len(mail_list)))
generate_contacts = [] cdef list generate_contacts = []
pinyin_name_list = read_pinyin_list_from_file(name_list_file_path) pinyin_name_list = read_pinyin_list_from_file(name_list_file_path)
# todo, remove the comment below
# random.shuffle(pinyin_name_list) # random.shuffle(pinyin_name_list)
with ThreadPoolExecutor(max_workers=len(mail_list)) as executor: with ThreadPoolExecutor(max_workers=len(mail_list)) as executor:
for mail in mail_list: for mail in mail_list:
@@ -17,6 +17,7 @@ def get_ordered_combins(stuff):
list_to_return.append(stuff[i:j]) list_to_return.append(stuff[i:j])
return list_to_return return list_to_return
# Keyword extraction # Keyword extraction
def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list: def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list:
@@ -210,21 +211,16 @@ def find_contact(generate_contacts: list, mail, pinyin_name_list):
generate_contacts.append(contact) generate_contacts.append(contact)
def generate_contact_from_mail_list(_start_position, _end_position, name_list_file_path="all_new_name_list.txt"): def generate_contact_from_mail_list(_start_position: int, _end_position: int,
name_list_file_path="all_new_name_list.txt"):
db_manager = MongoDbManager() db_manager = MongoDbManager()
# mail_list = db_manager.get_destination_emails()[8501:8520]
# mail_list = db_manager.get_destination_emails()[8521:8600]
mail_list = db_manager.get_destination_emails()[_start_position:_end_position] mail_list = db_manager.get_destination_emails()[_start_position:_end_position]
print("mail_list size before filter is {}".format(len(mail_list))) print("mail_list size before filter is {}".format(len(mail_list)))
filter_already_validated_contacts(mail_list) filter_already_validated_contacts(mail_list)
print("mail_list size after filter is {}".format(len(mail_list))) print("mail_list size after filter is {}".format(len(mail_list)))
# mail_list = db_manager.get_destination_emails()[9323:9914]
# excel_reader = ExcelHelper()
# mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/toExtract.xlsx")
generate_contacts = [] generate_contacts = []
pinyin_name_list = read_pinyin_list_from_file(name_list_file_path) pinyin_name_list = read_pinyin_list_from_file(name_list_file_path)
# todo, remove the comment below random.shuffle(pinyin_name_list)
# random.shuffle(pinyin_name_list)
with ThreadPoolExecutor(max_workers=len(mail_list)) as executor: with ThreadPoolExecutor(max_workers=len(mail_list)) as executor:
for mail in mail_list: for mail in mail_list:
executor.submit(find_contact, generate_contacts, mail, pinyin_name_list) executor.submit(find_contact, generate_contacts, mail, pinyin_name_list)
@@ -255,7 +251,8 @@ if __name__ == '__main__':
print("_from_position is {},_end_position is {}".format(_from_position, _end_position)) print("_from_position is {},_end_position is {}".format(_from_position, _end_position))
if _end_position <= step_list[-1]: if _end_position <= step_list[-1]:
print("start extraction from {} to {}".format(_from_position, _end_position)) print("start extraction from {} to {}".format(_from_position, _end_position))
generate_contact_from_mail_list(_from_position, _end_position, name_list_file_path = "/Users/lpan/Documents/workspace/appointment_tool/src/person_name/all_new_name_list.txt") generate_contact_from_mail_list(_from_position, _end_position,
name_list_file_path="/Users/lpan/Documents/workspace/appointment_tool/src/person_name/all_new_name_list.txt")
else: else:
print("stop with {},{}".format(_from_position, _end_position)) print("stop with {},{}".format(_from_position, _end_position))
# get_maximum_length() # get_maximum_length()
+2 -2
View File
@@ -4,9 +4,9 @@ from cython_extract_methods import generate_contact_from_mail_list
if __name__ == '__main__': if __name__ == '__main__':
start_position = 1 start_position = 1
end_position = 4 end_position = 202
# x = range(start_position, end_position, 100) # x = range(start_position, end_position, 100)
step = 2 step = 200
step_list = range(start_position, end_position, step) step_list = range(start_position, end_position, step)
print(step_list[-1]) print(step_list[-1])
for x in step_list: for x in step_list: