diff --git a/src/mail/check_email_existence.py b/src/mail/check_email_existence.py old mode 100644 new mode 100755 index 0995b03..f24d575 --- a/src/mail/check_email_existence.py +++ b/src/mail/check_email_existence.py @@ -1,9 +1,9 @@ from src.db.mongo_manager import MONGO_STORE_MANAGER -from src.utils.excel_reader import ExcelHelper +from src.utils.excel_reader import read_contacts +# 检查联系人表的有邮件有没有在数据库中 if __name__ == '__main__': - excel_reader = ExcelHelper() - contacts = excel_reader.read_contacts("/Users/lpan/Desktop/yahoo_aol.xlsx") + contacts = read_contacts("/Users/panlei/Desktop/yahoo_aol_valid_16.xlsx") mail_list = MONGO_STORE_MANAGER.get_destination_emails() print("mail_list size is " + str(len(mail_list))) mail_raw_list =[] diff --git a/src/person_name/contact_manager.py b/src/person_name/contact_manager.py index 48ddcbd..59acb43 100755 --- a/src/person_name/contact_manager.py +++ b/src/person_name/contact_manager.py @@ -1,11 +1,10 @@ from src.db.mongo_manager import MONGO_STORE_MANAGER -from src.utils.excel_reader import ExcelHelper +from src.utils.excel_reader import read_contacts def upload_contacts_list(): - excel_helper = ExcelHelper() - contacts_to_book = excel_helper.read_contacts("/Users/lpan/Desktop/yahoo_aol.xlsx") - return contacts_to_book + _contacts_to_book = read_contacts("/Users/panlei/Desktop/yahoo_aol_valid_25.xlsx") + return _contacts_to_book # 把新的联系人存到网上 diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py old mode 100644 new mode 100755 index bc99c38..fd7139e --- a/src/person_name/extract_name_with_pinyinlist.py +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -7,7 +7,7 @@ import xlsxwriter from src.db.mongo_manager import MongoDbManager from src.pojo.contact_pojo import ContactPojo from src.utils.excel_reader import get_random_fr_phone_numbers, ExcelHelper -from src.utils.generate_random_passport_id import get_random_passport_id_number +from src.utils.contacts.generate_random_passport_id import get_random_passport_id_number def get_ordered_combins(stuff): @@ -85,7 +85,7 @@ def has_numbers(inputString): def check_name(word_to_test, pinyin_name_list): - if "_" in word_to_test or "." in word_to_test: + if "_" in word_to_test or "." in word_to_test or "v" in word_to_test: return None if has_numbers(word_to_test): return None @@ -149,7 +149,7 @@ def generate_name_from_email(mail_address, pinyin_name_list): # found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list) # if found_name is not None: # return found_name - if len(word_to_test) >= 1: + if len(word_to_test) >= 5: found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list) if found_name is not None: return found_name @@ -200,10 +200,11 @@ def find_contact(generate_contacts: list, mail, pinyin_name_list): def generate_contact_from_mail_list(): db_manager = MongoDbManager() # mail_list = db_manager.get_destination_emails()[6000:7000] - # mail_list = db_manager.get_destination_emails()[3001:3200] - # mail_list = db_manager.get_destination_emails()[7570:7590] + # mail_list = db_manager.get_destination_emails()[7000:7500] + # mail_list = db_manager.get_destination_emails()[2201:2400] + # mail_list = db_manager.get_destination_emails()[7080:7181] excel_reader = ExcelHelper() - mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/mails/gmail_19_05.xlsx") + mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/panlei/Desktop/toExtract.xlsx") generate_contacts = [] pinyin_name_list = read_pinyin_list_from_file() random.shuffle(pinyin_name_list) diff --git a/src/utils/contacts/check_contact_name.py b/src/utils/contacts/check_contact_name.py new file mode 100644 index 0000000..08fc84a --- /dev/null +++ b/src/utils/contacts/check_contact_name.py @@ -0,0 +1,7 @@ +from src.utils.excel_reader import read_contacts + +_contact_list = read_contacts(file_name="/Users/panlei/Desktop/yahoo_aol_valid_23.xlsx") +# print(_contact_list) +for _contact in _contact_list: + if _contact.last_name is None or _contact.first_name is None: + print(_contact.mail) \ No newline at end of file diff --git a/src/utils/generate_random_passport_id.py b/src/utils/contacts/generate_random_passport_id.py old mode 100644 new mode 100755 similarity index 92% rename from src/utils/generate_random_passport_id.py rename to src/utils/contacts/generate_random_passport_id.py index 3b4f189..8c7d9e2 --- a/src/utils/generate_random_passport_id.py +++ b/src/utils/contacts/generate_random_passport_id.py @@ -57,7 +57,7 @@ def generate_titre_sejour_number(size=10) -> list: if __name__ == '__main__': # for i in range(1,200): # print(get_random_id_number()) - # for i in range(1, 501): - # print(get_random_passport_id_number()) - for id in generate_titre_sejour_number(3200): - print(id) + for i in range(1, 101): + print(get_random_passport_id_number()) + # for id in generate_titre_sejour_number(3200): + # print(id) diff --git a/src/utils/export_registered_users.py b/src/utils/export_registered_users.py new file mode 100644 index 0000000..8d72045 --- /dev/null +++ b/src/utils/export_registered_users.py @@ -0,0 +1,63 @@ +import xlsxwriter + +from src.db.mongo_manager import MONGO_STORE_MANAGER +from src.utils.excel_reader import read_contacts + + +class ExportedUser: + def __init__(self, last_name, first_name, password, mail, phone): + self.last_name = last_name + self.first_name = first_name + self.password = password + self.mail = mail + self.phone = phone + + +def export_registered_users() -> list: + _user_list = MONGO_STORE_MANAGER.get_all_registered_users() + print(len(_user_list)) + return _user_list + # for _user in _user_list: + # print(_user) + + +def read_contacts_to_check(): + _contact_list = read_contacts(file_name="/Users/panlei/Desktop/check_names.xlsx") + return _contact_list + + +def write_to_excel(_to_export_list): + row = 0 + col = 0 + # Create a workbook and add a worksheet. + workbook = xlsxwriter.Workbook('check_names_of_contacts_{}.xlsx'.format(len(_to_export_list))) + header_data = ['Nom/姓', 'Prénom/名', 'phone/电话', 'mail/邮箱', 'mot de pass/密码'] + worksheet = workbook.add_worksheet() + header_format = workbook.add_format({'bold': True}) + + for col_num, data in enumerate(header_data): + worksheet.write(row, col_num, data, header_format) + row = row + 1 + for info in _to_export_list: + info.phone = info.phone + # Iterate over the data and write it out row by row. + worksheet.write(row, col, info.last_name) + worksheet.write(row, col + 1, info.first_name) + worksheet.write(row, col + 2, "0"+info.phone) + worksheet.write(row, col + 3, info.mail) + worksheet.write(row, col + 4, info.password) + row += 1 + workbook.close() + + +if __name__ == '__main__': + _user_list = export_registered_users() + contact_list = read_contacts_to_check() + _to_export = [] + for _user in _user_list: + for contact in contact_list: + if _user.mail == contact.mail: + _export_user = ExportedUser(contact.last_name, contact.first_name, _user.password, contact.mail, + contact.phone) + _to_export.append(_export_user) + write_to_excel(_to_export) diff --git a/src/utils/extract_name.py b/src/utils/extract_name.py deleted file mode 100644 index 63c795d..0000000 --- a/src/utils/extract_name.py +++ /dev/null @@ -1,107 +0,0 @@ -import itertools - -import xlsxwriter -from pyhanlp import * -from Pinyin2Hanzi import DefaultHmmParams -from Pinyin2Hanzi import viterbi -from itertools import combinations - -from src.db.mongo_manager import MongoDbManager -from src.pojo.contact_pojo import ContactPojo -from src.utils.excel_reader import get_random_phone_numbers, generate_email_from_name -from src.utils.generate_random_passport_id import get_random_passport_id_number - - -def get_ordered_combins(stuff): - list_to_return = [] - for i, j in itertools.combinations(range(len(stuff) + 1), 2): - print(stuff[i:j]) - list_to_return.append(stuff[i:j]) - return list_to_return - - -def get_better_list(list): - return list - # for name in list: - # if len(name) == 2: - # list.remove(name) - # return list - - -# 关键词提取 - - -def generate_name_from_email(mail_address): - key_words = HanLP.extractKeyword(mail_address, 2) - print(key_words) - hmmparams = DefaultHmmParams() - pinyin_name_list = [] - chinese_name_list = [] - setence = "".join(key_words) - all_combins = get_ordered_combins(setence) - for i in all_combins: - word_to_test = "".join(i) - if len(word_to_test) >= 2: - # print("word to test is " + word_to_test) - try: - result = viterbi(hmm_params=hmmparams, observations=(word_to_test,), path_num=2) - for item in result: - print(item.score, item.path) - chinese_name_list.extend(item.path) - # print("word is " + word_to_test) - # if len(word_to_test) >= 3: - # all_combins.remove(i) - pinyin_name_list.append(word_to_test) - except Exception as error: - print(error) - - print(pinyin_name_list) - # 选择不重复的 - # if len(pinyin_name_list) > 3: - # return get_better_list(pinyin_name_list) - # else: - return pinyin_name_list, chinese_name_list - - -def write_new_contacts_to_excel(valid_contacts: list): - row = 0 - col = 0 - # Create a workbook and add a worksheet. - workbook = xlsxwriter.Workbook('real_name_contacts_{}.xlsx'.format(len(valid_contacts))) - header_data = ['name', 'phone', 'passport', 'email', 'note'] - worksheet = workbook.add_worksheet() - header_format = workbook.add_format({'bold': True}) - - for col_num, data in enumerate(header_data): - worksheet.write(row, col_num, data, header_format) - row = row + 1 - for info in valid_contacts: - info.phone = get_random_phone_numbers() - info.passport = get_random_passport_id_number() - # Iterate over the data and write it out row by row. - worksheet.write(row, col, "{} {}".format(info.last_name, info.first_name)) - worksheet.write(row, col + 1, info.phone) - worksheet.write(row, col + 2, info.passport) - worksheet.write(row, col + 3, info.mail) - worksheet.write(row, col + 4, info.note) - row += 1 - workbook.close() - - -if __name__ == '__main__': - db_manager = MongoDbManager() - mail_list = db_manager.get_destination_emails()[501:1000] - # mail_list = db_manager.get_destination_emails()[50:200] - generate_contacts = [] - for mail in mail_list: - contact = ContactPojo(mail=mail.mail, phone_number="", passport_number="", last_name="", first_name="") - spliteed = mail.mail.split("@") - possible_name_list = generate_name_from_email(spliteed[0])[0] - chinese_name_list = generate_name_from_email(spliteed[0])[1] - if len(possible_name_list) >= 2: - contact.last_name = possible_name_list[0] - contact.first_name = "".join(possible_name_list[1:-1]) - contact.note = " ".join(chinese_name_list) - if len(contact.last_name) > 0 and len(contact.first_name) > 0: - generate_contacts.append(contact) - write_new_contacts_to_excel(generate_contacts) diff --git a/src/utils/generate_graphe_from_excel.py b/src/utils/generate_graphe_from_excel.py new file mode 100644 index 0000000..1cd0b85 --- /dev/null +++ b/src/utils/generate_graphe_from_excel.py @@ -0,0 +1,76 @@ +import colorsys +import json +import random + +import numpy as np +import pandas +import matplotlib.patches as mpatches +from matplotlib import pyplot as plt + +from src.pojo.ReserveResultPojo import ReserveResultPojo + + +def read_reserve_pojo_from_excel(file_path: str): + print("read file " + file_path) + contact_list_in_json = pandas.read_excel(file_path) + # .to_json(orient='records')) + plot_bargraph_with_groupings(contact_list_in_json, "source_from", "source_from", "分布", "型号", "数量") + # contact_dict_list = json.loads(contact_list_in_json) + # contact_list = [] + # for contact_dict in contact_dict_list: + # last_name = contact_dict['last_name'] + # first_name = contact_dict['first_name'] + # phone_number = contact_dict['phone'] + # source_from = contact_dict['source_from'] + # contact = ReserveResultPojo() + # contact.lastName = last_name + # contact.firstName = first_name + # contact.phone = phone_number + # contact.source_from = source_from + # contact_list.append(contact) + # return contact_list + + +def plot_bargraph_with_groupings(df, groupby, colourby, title, xlabel, ylabel): + """ + Plots a dataframe showing the frequency of datapoints grouped by one column and coloured by another. + df : dataframe + groupby: the column to groupby + colourby: the column to color by + title: the graph title + xlabel: the x label, + ylabel: the y label + """ + randHSVcolors = [(np.random.rand(), 1, 1) for i in range(len(df[colourby].unique()))] + # Convert HSV list to RGB + randRGBcolors = [] + for HSVcolor in randHSVcolors: + randRGBcolors.append(colorsys.hsv_to_rgb(HSVcolor[0], HSVcolor[1], HSVcolor[2])) + # Makes a mapping from the unique colourby column items to a random color. + ind_col_map = {x: y for x, y in zip(df[colourby].unique(), + randRGBcolors)} + + # Find when the indicies of the soon to be bar graphs colors. + unique_comb = df[[groupby, colourby]].drop_duplicates() + name_ind_map = {x: y for x, y in zip(unique_comb[groupby], unique_comb[colourby])} + values_counts = df[groupby].value_counts() + c = values_counts.index.map(lambda x: ind_col_map[name_ind_map[x]]) + + # Makes the bargraph. + ax = df[groupby].value_counts().plot(kind='bar', + # figsize=FIG_SIZE, + title=title, + color=[c.values]) + # Makes a legend using the ind_col_map + legend_list = [] + for key in ind_col_map.keys(): + legend_list.append(mpatches.Patch(color=ind_col_map[key], label=key)) + + # display the graph. + plt.legend(handles=legend_list) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + + +if __name__ == '__main__': + read_reserve_pojo_from_excel("/Users/panlei/2023_10_27.xlsx")