From d4febdf96cd6601f7048e9de4665229943de8fd0 Mon Sep 17 00:00:00 2001 From: PAN Lei Date: Wed, 17 May 2023 12:16:10 +0200 Subject: [PATCH] can read links excel --- src/db/mongo_manager.py | 19 ++++++++++----- src/mail/MailManager.py | 10 ++++---- src/mail/mail_reader.py | 8 ++++--- .../extract_name_with_pinyinlist.py | 22 ++++++++++++----- src/pojo/ReserveResultPojo.py | 2 ++ src/utils/excel_reader.py | 24 +++++++++++++------ 6 files changed, 59 insertions(+), 26 deletions(-) diff --git a/src/db/mongo_manager.py b/src/db/mongo_manager.py index f4a949c..5400fe1 100644 --- a/src/db/mongo_manager.py +++ b/src/db/mongo_manager.py @@ -259,12 +259,19 @@ class MongoDbManager: def save_links_to_validate(self, link: str, mail_address: str): collection_to_use = self.db[LINKS_TO_VALIDATE] updated_at = time.strftime("%H:%M:%S", time.localtime()) - collection_to_use.replace_one(filter={'_id': link, }, replacement={ - u'url': link, - u'email': mail_address, - "updated_at": updated_at - }, - upsert=True) + if len(mail_address) > 0: + collection_to_use.replace_one(filter={'_id': link, }, replacement={ + u'url': link, + u'email': mail_address, + "updated_at": updated_at + }, + upsert=True) + else: + collection_to_use.replace_one(filter={'_id': link, }, replacement={ + u'url': link, + "updated_at": updated_at + }, + upsert=True) MONGO_STORE_MANAGER = MongoDbManager() diff --git a/src/mail/MailManager.py b/src/mail/MailManager.py index 21e33cb..075f51b 100644 --- a/src/mail/MailManager.py +++ b/src/mail/MailManager.py @@ -13,17 +13,19 @@ class MailManager: filter(lambda filtered_item: filtered_item.url_validated is None or filtered_item.url_validated is False, successful_items)) for item in not_validated_items: - if "@gmail" not in item.email and "@163" not in item.email: + if "@gmail" not in item.email and "@163" not in item.email and "@hotmail" not in item.email: invalid_contacts.append( ContactPojo(phone_number=item.phone, passport_number=item.passport, last_name=item.lastName, first_name=item.firstName, mail=item.email, )) return invalid_contacts + def get_valid_emails_for_day(self): valid_contacts = [] successful_items = MONGO_STORE_MANAGER.get_all_successful_items_for_day() validated_items = list( - filter(lambda filtered_item: filtered_item.url_validated is not None and filtered_item.url_validated is True, - successful_items)) + filter( + lambda filtered_item: filtered_item.url_validated is not None and filtered_item.url_validated is True, + successful_items)) for item in validated_items: if "@aol" in item.email: valid_contacts.append( @@ -56,4 +58,4 @@ class MailManager: if __name__ == '__main__': manager = MailManager() manager.write_invalid_contacts_to_excel(manager.get_invalid_emails_for_day()) - manager.write_invalid_contacts_to_excel(manager.get_valid_emails_for_day()) + # manager.write_invalid_contacts_to_excel(manager.get_valid_emails_for_day()) diff --git a/src/mail/mail_reader.py b/src/mail/mail_reader.py index 0f2be15..34d826b 100644 --- a/src/mail/mail_reader.py +++ b/src/mail/mail_reader.py @@ -199,9 +199,9 @@ class MailReader(): def need_to_valid_url(url: str, successful_items) -> bool: - # return True - if len(successful_items) == 0: - return False + return True + # if len(successful_items) == 0: + # return False print("url is :" + url) parts = url.split('/') id = parts[5] @@ -214,6 +214,8 @@ def need_to_valid_url(url: str, successful_items) -> bool: return not item.url_validated else: # if url_validated is None + if item.url_validated is not None: + return not item.url_validated return True return True else: diff --git a/src/person_name/extract_name_with_pinyinlist.py b/src/person_name/extract_name_with_pinyinlist.py index cbeb435..ad383c6 100644 --- a/src/person_name/extract_name_with_pinyinlist.py +++ b/src/person_name/extract_name_with_pinyinlist.py @@ -38,6 +38,8 @@ def read_pinyin_list_from_file() -> list: print("Line{}: {}".format(count, line.strip())) name_list.append(line.strip()) return name_list + + def read_pinyin_first_name_from_file() -> list: file2 = open('first_name_noDuplicates.txt', 'r') lines = file2.readlines() @@ -49,6 +51,7 @@ def read_pinyin_first_name_from_file() -> list: name_list.append(line.strip()) return name_list + def read_pinyin_last_name_from_file() -> list: file2 = open('last_name_noDuplicates.txt', 'r') lines = file2.readlines() @@ -76,8 +79,11 @@ def generate_new_list_from_old_name_list(): f.write(str(new_name)) f.close() + def has_numbers(inputString): return any(char.isdigit() for char in inputString) + + def check_name(word_to_test, pinyin_name_list): if "_" in word_to_test or "." in word_to_test: return None @@ -106,9 +112,13 @@ def generate_name_from_email(mail_address, pinyin_name_list): print("generate for " + mail_address) all_combins = get_ordered_combins(mail_address) all_combins.sort(key=len, reverse=True) + no_duplicated_list = [] + for word in all_combins: + if word not in no_duplicated_list: + no_duplicated_list.append(word) print(all_combins) - for i in all_combins: + for i in no_duplicated_list: word_to_test = "".join(i) print("word to test is " + word_to_test) # if len(word_to_test) >= 11: @@ -139,7 +149,7 @@ def generate_name_from_email(mail_address, pinyin_name_list): # found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list) # if found_name is not None: # return found_name - if len(word_to_test) >= 4: + if len(word_to_test) >= 1: found_name = check_name(word_to_test=word_to_test, pinyin_name_list=pinyin_name_list) if found_name is not None: return found_name @@ -191,13 +201,13 @@ def generate_contact_from_mail_list(): db_manager = MongoDbManager() # mail_list = db_manager.get_destination_emails()[6000:7000] # mail_list = db_manager.get_destination_emails()[3001:3200] - mail_list = db_manager.get_destination_emails()[7570:7590] - # excel_reader = ExcelHelper() - # mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/yahoo_list.xlsx") + # mail_list = db_manager.get_destination_emails()[7570:7590] + excel_reader = ExcelHelper() + mail_list = excel_reader.read_mails_and_pwd(file_name="/Users/lpan/Desktop/mails/163_list_60.xlsx") generate_contacts = [] pinyin_name_list = read_pinyin_list_from_file() random.shuffle(pinyin_name_list) - with ThreadPoolExecutor(max_workers=500) as executor: + with ThreadPoolExecutor(max_workers=len(mail_list)) as executor: for mail in mail_list: executor.submit(find_contact, generate_contacts, mail, pinyin_name_list) # for mail in mail_list: diff --git a/src/pojo/ReserveResultPojo.py b/src/pojo/ReserveResultPojo.py index 8a69709..54939e5 100644 --- a/src/pojo/ReserveResultPojo.py +++ b/src/pojo/ReserveResultPojo.py @@ -53,6 +53,8 @@ class ReserveResultPojo: url = "" if 'id' in source: id = source['id'] + if '_id' in source: + id = source['_id'] else: id = "" if 'email' in source: diff --git a/src/utils/excel_reader.py b/src/utils/excel_reader.py index f7eb231..eaa5be7 100644 --- a/src/utils/excel_reader.py +++ b/src/utils/excel_reader.py @@ -3,6 +3,7 @@ import random import string import pandas as pandas +import validators as validators import xlsxwriter from src.config import CONTACT_LIST_FILE @@ -14,16 +15,24 @@ from src.utils.generate_random_passport_id import get_random_passport_id_number phone_number_prefix = ['6'] +def read_links_to_click(file_path): + links_info_in_json = pandas.read_excel(file_path).to_json(orient='records') + # print(links_info_in_json) + for item in json.loads(links_info_in_json): + link = item['link'] + if validators.url(link): + print(link) + MONGO_STORE_MANAGER.save_links_to_validate(link, "") + + else: + print("error on link " + link) + + class ExcelHelper: def __init__(self): self._df = pandas.Series() - def write_to_exel(self, file_name, data_list: list): - new_df = pandas.Series(data_list) - self._df = pandas.concat([self._df, new_df]) - self._df.to_excel(file_name) - def read_user_agens(self) -> list: user_agent_in_json = pandas.read_excel( "/Users/lpan/Documents/workspace/appointment_tool/docs/mobile_user_agent_list.xlsx").to_json( @@ -231,8 +240,9 @@ if __name__ == '__main__': # contacts = excel_reader.read_names("/Users/lpan/Downloads/gmail_10.xlsx") # print(contacts) # write_new_contacts_to_excel(valid_contacts=contacts) - excel_reader = ExcelHelper() - excel_reader.check_contact_list("/Users/lpan/Desktop/contact_email_valid.xlsx") + # excel_reader = ExcelHelper() + # excel_reader.check_contact_list("/Users/lpan/Desktop/contact_email_valid.xlsx") + read_links_to_click("/Users/lpan/Downloads/链接.xlsx") # save_mails_to_db() # for mail in excel_reader.read_mails_and_pwd(): # MONGO_STORE_MANAGER.insert_email(mail)