Merge branch 'master' of bitbucket.org:panleicim/appointment_tool
This commit is contained in:
@@ -0,0 +1,6 @@
|
|||||||
|
from setuptools import setup
|
||||||
|
from Cython.Build import cythonize
|
||||||
|
|
||||||
|
# Translate the .pyx source into extension modules, then hand them to
# setuptools so the build commands can compile them.
extension_modules = cythonize("src/person_name/cython_extract_methods.pyx")

setup(ext_modules=extension_modules)
|
||||||
Executable
+157
@@ -0,0 +1,157 @@
|
|||||||
|
import itertools
|
||||||
|
import random
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from typing import Type
|
||||||
|
|
||||||
|
import xlsxwriter
|
||||||
|
|
||||||
|
from src.db.mongo_manager import MongoDbManager, MONGO_STORE_MANAGER
|
||||||
|
from src.pojo.contact_pojo import ContactPojo
|
||||||
|
from src.utils.contacts.generate_random_passport_id import get_random_passport_id_number
|
||||||
|
from src.utils.excel_reader import get_random_fr_phone_numbers
|
||||||
|
|
||||||
|
def get_ordered_combines(stuff):
    """Return the slices ``stuff[i:j]`` for every index pair ``i < j``.

    The index pairs are drawn, in lexicographic order, from
    ``range(len(stuff) + 3)``. Because that range runs three positions past
    the end of ``stuff``, slices that reach the end are repeated and empty
    slices are produced for start indexes at or beyond ``len(stuff)``.

    Args:
        stuff: Any sliceable sequence that supports ``len()`` (for example a
            string).

    Returns:
        A new list holding one slice per index pair.
    """
    overshoot = 3  # how far the index range extends past len(stuff)
    pair_size = 2  # every combination is a (start, stop) pair
    bounds = range(len(stuff) + overshoot)
    return [stuff[start:stop]
            for start, stop in itertools.combinations(bounds, pair_size)]
|
||||||
|
|
||||||
|
# Keyword extraction
|
||||||
|
|
||||||
|
def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list:
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        name_list_file_path: Path of the file to read. A relative path is
            resolved against the current working directory.

    Returns:
        A new list with one stripped string per line, in file order. Blank
        lines yield empty strings; an empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed, even when reading
    # fails part-way through.
    with open(name_list_file_path, 'r') as name_file:
        return [line.strip() for line in name_file]
|
||||||
|
|
||||||
|
def read_pinyin_first_name_from_file() -> list:
    """Read ``first_name_noDuplicates.txt`` and return its stripped lines.

    The file name is relative, so it is looked up in the current working
    directory.

    Returns:
        A new list with one stripped string per line, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed, even when reading
    # fails part-way through.
    with open('first_name_noDuplicates.txt', 'r') as name_file:
        return [line.strip() for line in name_file]
|
||||||
|
|
||||||
|
def read_pinyin_last_name_from_file() -> list:
    """Read ``last_name_noDuplicates.txt`` and return its stripped lines.

    The file name is relative, so it is looked up in the current working
    directory.

    Returns:
        A new list with one stripped string per line, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed, even when reading
    # fails part-way through.
    with open('last_name_noDuplicates.txt', 'r') as name_file:
        return [line.strip() for line in name_file]
|
||||||
|
|
||||||
|
def has_numbers(inputString: str):
    """Tell whether ``inputString`` contains at least one digit character.

    Args:
        inputString: The text to scan.

    Returns:
        True as soon as a character for which ``str.isdigit()`` holds is
        found, otherwise False (including for an empty string).
    """
    for character in inputString:
        if character.isdigit():
            return True
    return False
|
||||||
|
|
||||||
|
def check_name(word_to_test: str, pinyin_name_list: list):
    """Look for the first known name that contains ``word_to_test``.

    Args:
        word_to_test: Candidate text. It is rejected outright when it
            contains an underscore, a dot, a lowercase ``v`` or any digit.
        pinyin_name_list: Name entries; each entry is split on single spaces,
            the first token being used as the last name and the final token
            as the first name.

    Returns:
        ``(last_name, first_name)`` of the first entry for which the
        lower-cased candidate is a substring of the lower-cased last name,
        first name, last+first or first+last; ``None`` when the candidate is
        rejected or nothing matches.
    """
    if any(marker in word_to_test for marker in ("_", ".", "v")):
        return None
    if any(character.isdigit() for character in word_to_test):
        return None

    needle = word_to_test.lower()
    for entry in pinyin_name_list:
        tokens = entry.split(" ")
        last_name, first_name = tokens[0], tokens[-1]
        haystacks = (
            last_name,
            first_name,
            last_name + first_name,
            first_name + last_name,
        )
        # Every kind of match yields the same answer, so one test suffices.
        if any(needle in haystack.lower() for haystack in haystacks):
            return last_name, first_name
    return None
|
||||||
|
|
||||||
|
def generate_name_from_email(mail_address, pinyin_name_list):
    """Try to derive a (last_name, first_name) pair from part of a mail address.

    All slices of ``mail_address`` produced by ``get_ordered_combines`` are
    ordered longest first and de-duplicated; each distinct slice of at least
    4 characters is announced with a ``print``. The slices whose length is
    between 4 and 18 are then passed to ``check_name`` in that order.

    Args:
        mail_address: The text to slice (callers pass the part of the
            address before the ``@``).
        pinyin_name_list: Name entries forwarded to ``check_name``.

    Returns:
        The first non-``None`` result of ``check_name``, or ``None`` when no
        slice matches.
    """
    cdef list unique_words = []
    cdef int min_length = 4, max_length = 18

    # Longest candidates first; the sort is stable, so equally long slices
    # keep the order in which they were generated.
    candidates = sorted(get_ordered_combines(mail_address), key=len, reverse=True)

    for word in candidates:
        if len(word) < min_length or word in unique_words:
            continue
        print("add word {}".format(word))
        unique_words.append(word)

    for word in unique_words:
        word_to_test = "".join(word)
        if not (min_length <= len(word_to_test) <= max_length):
            continue
        found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list)
        if found_name is not None:
            return found_name

    # No candidate matched any known name.
    return None
|
||||||
|
|
||||||
|
def write_new_contacts_to_excel(valid_contacts: Type[list]):
    """Write the given contacts to ``real_name_contacts_<count>.xlsx``.

    Row 0 holds the bold header ``name, phone, passport, email, store``.
    Each following row holds one contact. Before a contact is written, its
    ``phone`` and ``passport`` attributes are overwritten with values from
    ``get_random_fr_phone_numbers()`` and ``get_random_passport_id_number()``.
    The store column always receives the literal text ``random``.

    Args:
        valid_contacts: Contacts exposing ``last_name``, ``first_name`` and
            ``mail`` attributes; ``len(valid_contacts)`` is used in the
            output file name.
    """
    cdef int row = 0
    cdef int col = 0

    header_data = ['name', 'phone', 'passport', 'email', 'store']

    # Create a workbook and add a worksheet.
    workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts)))
    worksheet = workbook.add_worksheet()
    header_format = workbook.add_format({'bold': True})

    for header_col, title in enumerate(header_data):
        worksheet.write(row, header_col, title, header_format)
    row += 1

    for info in valid_contacts:
        info.phone = get_random_fr_phone_numbers()
        info.passport = get_random_passport_id_number()
        cells = (
            "{} {}".format(info.last_name, info.first_name),
            info.phone,
            info.passport,
            info.mail,
            "random",
        )
        # One worksheet row per contact, cells in header order.
        for offset, value in enumerate(cells):
            worksheet.write(row, col + offset, value)
        row += 1

    workbook.close()
|
||||||
|
|
||||||
|
def find_contact(generate_contacts: Type[list], mail, pinyin_name_list: list):
    """Build a contact for ``mail`` and collect it when a name could be derived.

    A ``ContactPojo`` is created with empty phone, passport and name fields
    and the store set to ``random``. The part of ``mail.mail`` before the
    first ``@`` is passed to ``generate_name_from_email``; when that returns
    a result, its first two items become the last and first name.

    Args:
        generate_contacts: List that receives the contact; it is appended to
            only when both names end up non-empty.
        mail: Object exposing the address as its ``mail`` attribute.
        pinyin_name_list: Name entries forwarded to
            ``generate_name_from_email``.
    """
    cdef int min_len = 0

    address = mail.mail
    contact = ContactPojo(mail=address, phone_number="", passport_number="",
                          last_name="", first_name="", store="random")

    local_part = address.partition("@")[0]
    name_pair = generate_name_from_email(local_part, pinyin_name_list)
    if name_pair is not None:
        contact.last_name = name_pair[0]
        contact.first_name = name_pair[1]

    has_last_name = len(contact.last_name) > min_len
    if has_last_name and len(contact.first_name) > min_len:
        generate_contacts.append(contact)
|
||||||
|
|
||||||
|
def generate_contact_from_mail_list(_start_position: int, _end_position: int,
                                    name_list_file_path="all_new_name_list.txt"):
    """Derive named contacts for a slice of the stored mails and export them.

    Steps: fetch ``get_destination_emails()[_start_position:_end_position]``
    from a new ``MongoDbManager``, drop entries via
    ``filter_already_validated_contacts``, read the name list, run
    ``find_contact`` for every remaining mail on a thread pool, then pass the
    collected contacts to ``write_new_contacts_to_excel``.

    Args:
        _start_position: Start index of the slice of destination mails.
        _end_position: End index (exclusive) of that slice.
        name_list_file_path: Path of the name list handed to
            ``read_pinyin_list_from_file``.
    """
    cdef list generate_contacts = []

    db_manager = MongoDbManager()
    mail_list = db_manager.get_destination_emails()[_start_position:_end_position]
    print("mail_list size before filter is {}".format(len(mail_list)))
    filter_already_validated_contacts(mail_list)
    print("mail_list size after filter is {}".format(len(mail_list)))

    # The name list is used in file order (it is not shuffled).
    pinyin_name_list = read_pinyin_list_from_file(name_list_file_path)

    # ThreadPoolExecutor raises ValueError for max_workers <= 0, so the pool
    # is only created when at least one mail survived the filter.
    worker_count = len(mail_list)
    if worker_count > 0:
        # Leaving the ``with`` block waits for every submitted task.
        # NOTE(review): the returned futures are discarded, so an exception
        # raised inside find_contact is not reported here.
        with ThreadPoolExecutor(max_workers=worker_count) as executor:
            for mail in mail_list:
                executor.submit(find_contact, generate_contacts, mail, pinyin_name_list)

    write_new_contacts_to_excel(generate_contacts)
|
||||||
|
|
||||||
|
def filter_already_validated_contacts(_list_to_extract):
    """Remove, in place, the entries whose mail is already known to the store.

    Every element of ``_list_to_extract`` whose ``mail`` attribute equals the
    ``mail`` of any contact returned by
    ``MONGO_STORE_MANAGER.get_all_contacts_to_book()`` is dropped, including
    repeated occurrences of the same mail. The relative order of the
    remaining elements is kept.

    Args:
        _list_to_extract: Mutable list of objects exposing a ``mail``
            attribute; it is modified in place and nothing is returned.
    """
    # One set lookup per entry instead of a nested scan.
    # Assumes mail values are hashable (e.g. strings) - TODO confirm.
    known_mails = {
        known_contact.mail
        for known_contact in MONGO_STORE_MANAGER.get_all_contacts_to_book()
    }
    # Build the surviving list first and assign it through a slice: the
    # caller's list object is updated without removing items from a list
    # that is being iterated (which skips the element after each removal).
    _list_to_extract[:] = [
        candidate for candidate in _list_to_extract
        if candidate.mail not in known_mails
    ]
|
||||||
@@ -18,18 +18,10 @@ def get_ordered_combins(stuff):
|
|||||||
return list_to_return
|
return list_to_return
|
||||||
|
|
||||||
|
|
||||||
def get_better_list(list):
|
|
||||||
return list
|
|
||||||
# for name in list:
|
|
||||||
# if len(name) == 2:
|
|
||||||
# list.remove(name)
|
|
||||||
# return list
|
|
||||||
|
|
||||||
|
|
||||||
# 关键词提取
|
# 关键词提取
|
||||||
|
|
||||||
def read_pinyin_list_from_file() -> list:
|
def read_pinyin_list_from_file(name_list_file_path="all_new_name_list.txt") -> list:
|
||||||
file2 = open('all_new_name_list.txt', 'r')
|
file2 = open(name_list_file_path, 'r')
|
||||||
lines = file2.readlines()
|
lines = file2.readlines()
|
||||||
name_list = []
|
name_list = []
|
||||||
count = 0
|
count = 0
|
||||||
@@ -210,8 +202,8 @@ def write_new_contacts_to_excel(valid_contacts: list):
|
|||||||
def find_contact(generate_contacts: list, mail, pinyin_name_list):
|
def find_contact(generate_contacts: list, mail, pinyin_name_list):
|
||||||
contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="",
|
contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="",
|
||||||
store="random")
|
store="random")
|
||||||
spliteed = mail.mail.split("@")
|
splitted = mail.mail.split("@")
|
||||||
possible_name_list = generate_name_from_email(spliteed[0], pinyin_name_list)
|
possible_name_list = generate_name_from_email(splitted[0], pinyin_name_list)
|
||||||
if possible_name_list is not None:
|
if possible_name_list is not None:
|
||||||
contact.last_name = possible_name_list[0]
|
contact.last_name = possible_name_list[0]
|
||||||
contact.first_name = possible_name_list[1]
|
contact.first_name = possible_name_list[1]
|
||||||
@@ -219,19 +211,15 @@ def find_contact(generate_contacts: list, mail, pinyin_name_list):
|
|||||||
generate_contacts.append(contact)
|
generate_contacts.append(contact)
|
||||||
|
|
||||||
|
|
||||||
def generate_contact_from_mail_list(_start_position, _end_position):
|
def generate_contact_from_mail_list(_start_position: int, _end_position: int,
|
||||||
|
name_list_file_path="all_new_name_list.txt"):
|
||||||
db_manager = MongoDbManager()
|
db_manager = MongoDbManager()
|
||||||
# mail_list = db_manager.get_destination_emails()[8501:8520]
|
|
||||||
# mail_list = db_manager.get_destination_emails()[8521:8600]
|
|
||||||
mail_list = db_manager.get_destination_emails()[_start_position:_end_position]
|
mail_list = db_manager.get_destination_emails()[_start_position:_end_position]
|
||||||
print("mail_list size before filter is {}".format(len(mail_list)))
|
print("mail_list size before filter is {}".format(len(mail_list)))
|
||||||
filter_already_validated_contacts(mail_list)
|
filter_already_validated_contacts(mail_list)
|
||||||
print("mail_list size after filter is {}".format(len(mail_list)))
|
print("mail_list size after filter is {}".format(len(mail_list)))
|
||||||
# mail_list = db_manager.get_destination_emails()[9323:9914]
|
|
||||||
# excel_reader = ExcelHelper()
|
|
||||||
# mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/toExtract.xlsx")
|
|
||||||
generate_contacts = []
|
generate_contacts = []
|
||||||
pinyin_name_list = read_pinyin_list_from_file()
|
pinyin_name_list = read_pinyin_list_from_file(name_list_file_path)
|
||||||
random.shuffle(pinyin_name_list)
|
random.shuffle(pinyin_name_list)
|
||||||
with ThreadPoolExecutor(max_workers=len(mail_list)) as executor:
|
with ThreadPoolExecutor(max_workers=len(mail_list)) as executor:
|
||||||
for mail in mail_list:
|
for mail in mail_list:
|
||||||
@@ -251,9 +239,9 @@ def filter_already_validated_contacts(_list_to_extract):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
start_position = 1
|
start_position = 1
|
||||||
end_position = 502
|
end_position = 3
|
||||||
# x = range(start_position, end_position, 100)
|
# x = range(start_position, end_position, 100)
|
||||||
step = 500
|
step = 1
|
||||||
step_list = range(start_position, end_position, step)
|
step_list = range(start_position, end_position, step)
|
||||||
print(step_list[-1])
|
print(step_list[-1])
|
||||||
for x in step_list:
|
for x in step_list:
|
||||||
@@ -263,7 +251,8 @@ if __name__ == '__main__':
|
|||||||
print("_from_position is {},_end_position is {}".format(_from_position, _end_position))
|
print("_from_position is {},_end_position is {}".format(_from_position, _end_position))
|
||||||
if _end_position <= step_list[-1]:
|
if _end_position <= step_list[-1]:
|
||||||
print("start extraction from {} to {}".format(_from_position, _end_position))
|
print("start extraction from {} to {}".format(_from_position, _end_position))
|
||||||
generate_contact_from_mail_list(_from_position, _end_position)
|
generate_contact_from_mail_list(_from_position, _end_position,
|
||||||
|
name_list_file_path="/Users/lpan/Documents/workspace/appointment_tool/src/person_name/all_new_name_list.txt")
|
||||||
else:
|
else:
|
||||||
print("stop with {},{}".format(_from_position, _end_position))
|
print("stop with {},{}".format(_from_position, _end_position))
|
||||||
# get_maximum_length()
|
# get_maximum_length()
|
||||||
|
|||||||
@@ -0,0 +1,22 @@
|
|||||||
|
from cython_extract_methods import generate_contact_from_mail_list
|
||||||
|
|
||||||
|
# from extract_name_with_pinyinlist import generate_contact_from_mail_list
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Absolute location of the name list handed to the extraction routine.
    name_list_path = "/Users/lpan/Documents/workspace/appointment_tool/src/person_name/all_new_name_list.txt"

    start_position = 1
    end_position = 202
    step = 200

    # Window start offsets; with the values above these are 1 and 201.
    step_list = range(start_position, end_position, step)
    last_start = step_list[-1]
    print(last_start)

    for _from_position in step_list:
        print(_from_position)
        _end_position = _from_position + step
        print("_from_position is {},_end_position is {}".format(_from_position, _end_position))

        # A window is only processed when its end does not go past the last
        # start offset; any other window is just reported.
        if _end_position > last_start:
            print("stop with {},{}".format(_from_position, _end_position))
            continue

        print("start extraction from {} to {}".format(_from_position, _end_position))
        generate_contact_from_mail_list(_from_position, _end_position,
                                        name_list_file_path=name_list_path)
|
||||||
@@ -18,6 +18,7 @@ def get_contact_list() -> list:
|
|||||||
_contact = ContactPojo(first_name=item.first_name, last_name=item.last_name, mail=item.mail,
|
_contact = ContactPojo(first_name=item.first_name, last_name=item.last_name, mail=item.mail,
|
||||||
phone_number=item.phone, passport_number=item.passport, store=item.store)
|
phone_number=item.phone, passport_number=item.passport, store=item.store)
|
||||||
_contact.ip_address = item.ip_address
|
_contact.ip_address = item.ip_address
|
||||||
|
_contact.created_at = item.created_at
|
||||||
_contact_list.append(_contact)
|
_contact_list.append(_contact)
|
||||||
return _contact_list
|
return _contact_list
|
||||||
|
|
||||||
@@ -39,7 +40,7 @@ def write_contact_with_ip_info_to_file(contact_list):
|
|||||||
col = 0
|
col = 0
|
||||||
# Create a workbook and add a worksheet.
|
# Create a workbook and add a worksheet.
|
||||||
workbook = xlsxwriter.Workbook('ip_info_{}.xlsx'.format(len(contact_list)))
|
workbook = xlsxwriter.Workbook('ip_info_{}.xlsx'.format(len(contact_list)))
|
||||||
header_data = ['name', 'email', 'isp', 'ip_address']
|
header_data = ['name', 'email', 'isp', 'ip_address', 'created_at']
|
||||||
worksheet = workbook.add_worksheet()
|
worksheet = workbook.add_worksheet()
|
||||||
header_format = workbook.add_format({'bold': True})
|
header_format = workbook.add_format({'bold': True})
|
||||||
|
|
||||||
@@ -52,6 +53,7 @@ def write_contact_with_ip_info_to_file(contact_list):
|
|||||||
worksheet.write(row, col + 1, info.mail)
|
worksheet.write(row, col + 1, info.mail)
|
||||||
worksheet.write(row, col + 2, info.isp)
|
worksheet.write(row, col + 2, info.isp)
|
||||||
worksheet.write(row, col + 3, info.ip_address)
|
worksheet.write(row, col + 3, info.ip_address)
|
||||||
|
worksheet.write(row, col + 4, info.created_at)
|
||||||
row += 1
|
row += 1
|
||||||
workbook.close()
|
workbook.close()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user