diff --git a/db/mongo_manager.py b/db/mongo_manager.py index 9ccf1d4..47068b5 100755 --- a/db/mongo_manager.py +++ b/db/mongo_manager.py @@ -2,6 +2,7 @@ import datetime import logging import time import os +from typing import Optional from pymongo import MongoClient @@ -22,6 +23,7 @@ DESTINATION_EMAIL_LIST = "DESTINATION_EMAIL_LIST" LINKS_TO_VALIDATE = "LINKS_TO_VALIDATE" INVALID_EMAIL_LIST = "INVALID_EMAIL_LIST" CONTACT_LIST_SERIAL_MAP = "CONTACT_LIST_SERIAL_MAP" +MAIL_READ_LOG = "MAIL_READ_LOG" # 记录每个邮箱上次读取时间 class MongoDbManager: @@ -264,5 +266,29 @@ class MongoDbManager: collection_to_use = self.db[LINKS_TO_VALIDATE] collection_to_use.delete_one({"_id": linkPojo.email}) + # ── Mail read-time tracking ──────────────────────────────────── + + def get_last_mail_read_time(self, mail: str) -> Optional[datetime.datetime]: + """返回指定邮箱上次被读取的 UTC 时间,若从未读取则返回 None。""" + try: + doc = self.db[MAIL_READ_LOG].find_one({"_id": mail}) + if doc and "last_read_at" in doc: + return doc["last_read_at"] + except Exception as err: + self.logger.warning("get_last_mail_read_time error: %s", err) + return None + + def update_mail_read_time(self, mail: str) -> None: + """将指定邮箱的上次读取时间更新为当前 UTC 时间。""" + try: + self.db[MAIL_READ_LOG].replace_one( + {"_id": mail}, + {"_id": mail, "last_read_at": datetime.datetime.utcnow()}, + upsert=True, + ) + except Exception as err: + self.logger.warning("update_mail_read_time error: %s", err) + MONGO_STORE_MANAGER = MongoDbManager() + diff --git a/mail/__init__.py b/mail/__init__.py index e69de29..fc66ac1 100644 --- a/mail/__init__.py +++ b/mail/__init__.py @@ -0,0 +1,66 @@ +""" +mail/ +===== +Package de lecture IMAP pour le bot appointment_request. + +Architecture (du plus bas au plus haut niveau) : + + imap_proxy_reader — bibliothèque de bas niveau, sans dépendances internes + ProxyConfig dataclass de configuration proxy (SOCKS5/SOCKS4/HTTP) + ProxyIMAPClient IMAPClient passant par un proxy + get_imap_server(login) résolution domaine → serveur IMAP + extract_body(msg) extraction HTML/texte d'un email + send_imap_id(imap) spoofing fingerprint client (RFC 2971) + VALIDATION_URL_* constantes Hermes (source de vérité) + IMAP_SERVER_MAP table domaine → serveur + + mail_constants IMAPClient avec fingerprint + fabrique create_imap() + FingerprintIMAPClient IMAPClient auto-ID après login + create_imap(login) fabrique → FingerprintIMAPClient sur le bon serveur + show_folders(imap) liste des dossiers (IMAPClient ou imaplib) + + mail_reader_all_contacts logique métier de haut niveau + MailReader lit les emails d'un compte (direct ou proxy) + find_links_to_validate_from_mail_list() point d'entrée principal +""" + +from mail.imap_proxy_reader import ( + ProxyConfig, + ProxyIMAPClient, + get_imap_server, + extract_body, + send_imap_id, + VALIDATION_URL_SUBJECT_FR, + VALIDATION_URL_SUBJECT_EN, + VALIDATION_URL_REGEX, + IMAP_SERVER_MAP, +) +from mail.mail_constants import ( + FingerprintIMAPClient, + create_imap, + show_folders, +) +from mail.mail_reader_all_contacts import ( + MailReader, + find_links_to_validate_from_mail_list, +) + +__all__ = [ + # imap_proxy_reader + "ProxyConfig", + "ProxyIMAPClient", + "get_imap_server", + "extract_body", + "send_imap_id", + "VALIDATION_URL_SUBJECT_FR", + "VALIDATION_URL_SUBJECT_EN", + "VALIDATION_URL_REGEX", + "IMAP_SERVER_MAP", + # mail_constants + "FingerprintIMAPClient", + "create_imap", + "show_folders", + # mail_reader_all_contacts + "MailReader", + "find_links_to_validate_from_mail_list", +] diff --git a/mail/imap_proxy_reader.py b/mail/imap_proxy_reader.py index f919512..0a19de0 100644 --- a/mail/imap_proxy_reader.py +++ b/mail/imap_proxy_reader.py @@ -26,6 +26,8 @@ from dataclasses import dataclass, field from email.message import Message from typing import List, Optional, Tuple +import random + import socks from dotenv import load_dotenv from imapclient import IMAPClient @@ -40,7 +42,7 @@ VALIDATION_URL_SUBJECT_FR = "Validation de votre demande de rendez-vous" VALIDATION_URL_SUBJECT_EN = "Please confirm your appointment request" VALIDATION_URL_REGEX = ( r"https:\/\/rendezvousparis\.hermes\.com" - r"\/client\/register\/[A-Z0-9]+\/validate\.code=[A-Z0-9]+" + r"\/client\/register\/[A-Z0-9]+\/validate[?.]code=[A-Z0-9]+" ) DATE_FORMAT = "%d-%b-%Y" @@ -198,11 +200,122 @@ class ProxyIMAP4_TLS(imaplib.IMAP4): def shutdown(self) -> None: imaplib.IMAP4.shutdown(self) + def id(self, parameters: dict) -> tuple: + """ + Envoie la commande IMAP ID (RFC 2971). + parameters : dict ex. {"name": "Thunderbird", "version": "115.0"} + Retourne le tuple brut (typ, data) renvoyé par le serveur. + """ + args = " ".join( + '"{}"'.format(str(v).replace('"', '\\"')) + for pair in parameters.items() + for v in pair + ) + return self._simple_command("ID", "({})".format(args)) + # ────────────────────────────────────────────────────────────── -# IMAPClient avec proxy +# Profils de clients IMAP réels (pour spoofing du fingerprint) # ────────────────────────────────────────────────────────────── +_IMAP_CLIENT_PROFILES = [ + # Mozilla Thunderbird 115 (ESR) — Windows + { + "name": "Thunderbird", + "version": "115.9.0", + "vendor": "Mozilla", + "support-url": "https://support.mozilla.org/", + "command": "IMAP4rev1", + "os": "Windows NT 10.0", + "os-version": "10.0", + }, + # Mozilla Thunderbird 115 — macOS + { + "name": "Thunderbird", + "version": "115.9.0", + "vendor": "Mozilla", + "support-url": "https://support.mozilla.org/", + "command": "IMAP4rev1", + "os": "macOS", + "os-version": "14.4", + }, + # Apple Mail — macOS Sonoma + { + "name": "Mac OS X Mail", + "version": "16.0", + "vendor": "Apple Inc.", + "support-url": "https://support.apple.com/mail", + "os": "Mac OS X", + "os-version": "14.4", + }, + # Apple Mail — iOS + { + "name": "iPhone Mail", + "version": "17.4", + "vendor": "Apple Inc.", + "os": "iOS", + "os-version": "17.4", + }, + # Outlook pour Windows (MAPI/IMAP bridge) + { + "name": "Microsoft Outlook", + "version": "16.0.17531.20108", + "vendor": "Microsoft Corporation", + "support-url": "https://support.microsoft.com/outlook", + "os": "Windows NT 10.0", + "os-version": "10.0", + }, +] + + +def _random_imap_id_params() -> dict: + """Retourne un profil aléatoire parmi les clients IMAP réels.""" + return random.choice(_IMAP_CLIENT_PROFILES) + + +def send_imap_id(imap, params: Optional[dict] = None) -> None: + """ + Envoie la commande IMAP ID après connexion pour usurper le fingerprint + client. Fonctionne avec IMAPClient (imapclient) et imaplib.IMAP4. + + Paramètres + ---------- + imap : IMAPClient | imaplib.IMAP4 + params : dict, optional — si None, un profil aléatoire est choisi. + """ + if params is None: + params = _random_imap_id_params() + + try: + if isinstance(imap, IMAPClient): + # imapclient expose _imap (l'objet imaplib sous-jacent) + _raw = imap._imap + if hasattr(_raw, "id"): + _raw.id(params) + else: + # Fallback : commande brute via imapclient + args = " ".join( + '"{}"'.format(str(v).replace('"', '\\"')) + for pair in params.items() + for v in pair + ) + imap._imap._simple_command("ID", "({})".format(args)) + elif hasattr(imap, "id"): + # ProxyIMAP4_TLS ou tout imaplib.IMAP4 patchable + imap.id(params) + else: + # Dernier recours : commande brute imaplib + args = " ".join( + '"{}"'.format(str(v).replace('"', '\\"')) + for pair in params.items() + for v in pair + ) + imap._simple_command("ID", "({})".format(args)) + except Exception as exc: + logger.debug("IMAP ID non supporté ou ignoré : %s", exc) + + + class ProxyIMAPClient(IMAPClient): """ Sous-classe d'IMAPClient qui utilise un proxy SOCKS/HTTP. @@ -239,6 +352,12 @@ class ProxyIMAPClient(IMAPClient): "Utilisez ssl=True (port 993)." ) + def login(self, username: str, password: str): + """Surcharge login() pour envoyer IMAP ID juste après l'authentification.""" + result = super().login(username, password) + send_imap_id(self) + return result + # ────────────────────────────────────────────────────────────── # Fonctions utilitaires diff --git a/mail/mail_constants.py b/mail/mail_constants.py index b1156d2..3b66ca5 100755 --- a/mail/mail_constants.py +++ b/mail/mail_constants.py @@ -1,132 +1,102 @@ +""" +mail_constants.py +================= +Constantes de domaine email et fabrique d'instances IMAPClient. + +Architecture mail/ : + imap_proxy_reader ← bibliothèque de bas niveau (proxy, IMAP ID, extract_body, server map) + mail_constants ← cette couche : FingerprintIMAPClient + create_imap() pour les comptes directs + mail_reader_all_contacts ← logique métier haut niveau (MailReader, find_links_to_validate_…) +""" + import imaplib from imapclient import IMAPClient +from mail.imap_proxy_reader import send_imap_id, get_imap_server -# 邮件域名常量 -DOMAIN_YAHOO = "yahoo.com" -DOMAIN_SINA = "sina.com" -DOMAIN_HOTMAIL = "hotmail.com" -DOMAIN_TIM_IT = "tim.it" -DOMAIN_163 = "163.com" -DOMAIN_RAMBLER_RU = "rambler.ru" -DOMAIN_ALICE_IT = "alice.it" -DOMAIN_MARS_DTI_NE_JP = "mars.dti.ne.jp" -DOMAIN_BTVM_NE_JP = "btvm.ne.jp" -DOMAIN_AURORA_DTI_NE_JP = "aurora.dti.ne.jp" -DOMAIN_GMAIL = "gmail.com" -DOMAIN_GMX = "gmx.com" -DOMAIN_GMX_NET = "gmx.net" -DOMAIN_GMX_AT = "gmx.at" -DOMAIN_GMX_FR = "gmx.fr" -DOMAIN_GMX_US = "gmx.us" -DOMAIN_GMX_SG = "gmx.sg" -DOMAIN_GMX_CH = "gmx.ch" -DOMAIN_GMX_PT = "gmx.pt" -DOMAIN_ONET = "onet.pl" -DOMAIN_GAZETA_PL = "gazeta.pl" -DOMAIN_NAVER = "naver.com" -DOMAIN_INBOX_LV = "inbox.lv" -DOMAIN_GMX_DE = "gmx.de" -# 垃圾邮件域名 -DOMAIN_PISS_MAIL = "pissmail.com" -DOMAIN_INCEL_EMAIL = "incel.email" +# ── Constantes de domaine (conservées pour la compatibilité des imports externes) ── + +DOMAIN_YAHOO = "yahoo.com" +DOMAIN_SINA = "sina.com" +DOMAIN_HOTMAIL = "hotmail.com" +DOMAIN_TIM_IT = "tim.it" +DOMAIN_163 = "163.com" +DOMAIN_RAMBLER_RU = "rambler.ru" +DOMAIN_ALICE_IT = "alice.it" +DOMAIN_MARS_DTI_NE_JP = "mars.dti.ne.jp" +DOMAIN_BTVM_NE_JP = "btvm.ne.jp" +DOMAIN_AURORA_DTI_NE_JP = "aurora.dti.ne.jp" +DOMAIN_GMAIL = "gmail.com" +DOMAIN_GMX = "gmx.com" +DOMAIN_GMX_NET = "gmx.net" +DOMAIN_GMX_AT = "gmx.at" +DOMAIN_GMX_FR = "gmx.fr" +DOMAIN_GMX_US = "gmx.us" +DOMAIN_GMX_SG = "gmx.sg" +DOMAIN_GMX_CH = "gmx.ch" +DOMAIN_GMX_PT = "gmx.pt" +DOMAIN_ONET = "onet.pl" +DOMAIN_GAZETA_PL = "gazeta.pl" +DOMAIN_NAVER = "naver.com" +DOMAIN_INBOX_LV = "inbox.lv" +DOMAIN_GMX_DE = "gmx.de" +DOMAIN_PISS_MAIL = "pissmail.com" +DOMAIN_INCEL_EMAIL = "incel.email" DOMAIN_SHITPOSTING_EXPERT = "shitposting.expert" -DOMAIN_HATESJE_WS = "hatesje.ws" -DOMAIN_CHILD_PIZZA = "child.pizza" -DOMAIN_GENOCIDE_FUN = "genocide.fun" -DOMAIN_DMC_CHAT = "dmc.chat" -DOMAIN_WEB_DE = "web.de" -DOMAIN_OUTLOOK_COM = "outlook.com" -DOMAIN_FIREMAIL_DE = "firemail.de" +DOMAIN_HATESJE_WS = "hatesje.ws" +DOMAIN_CHILD_PIZZA = "child.pizza" +DOMAIN_GENOCIDE_FUN = "genocide.fun" +DOMAIN_DMC_CHAT = "dmc.chat" +DOMAIN_WEB_DE = "web.de" +DOMAIN_OUTLOOK_COM = "outlook.com" +DOMAIN_FIREMAIL_DE = "firemail.de" -# IMAP服务器地址常量 -AOL_IMAP_SERVER = "imap.aol.com" -IMAP_SERVER_163 = "imap.163.com" -IMAP_SERVER_SINA = "imap.sina.com" -YAHOO_IMAP_SERVER = "imap.mail.yahoo.com" -HOTMAIL_IMAP_SERVER = "outlook.office365.com" +# ── Note : les constantes IMAP_SERVER_* ont été supprimées. ─────────────────── +# Utiliser imap_proxy_reader.IMAP_SERVER_MAP ou imap_proxy_reader.get_imap_server(login) +# pour obtenir le serveur IMAP correspondant à un domaine. -RAMBLER_IMAP_SERVER = "imap.rambler.ru" -ALICE_IMAP_SERVER = "in.alice.it" -TIME_IT_SERVER = "imap.tim.it" -MARS_DTI_NE_JP_SERVER = "imap.cm.dream.jp" -NAVER_SERVER = "imap.naver.com" -BTVM_NE_JP_SERVER = "imap.btvm.ne.jp" -GMAIL_IMAP_SERVER = "imap.gmail.com" -ONET_IMAP_SERVER = "imap.poczta.onet.pl" -GMX_IMAP_SERVER = "imap.gmx.com" -GMX_NET_IMAP_SERVER = "imap.gmx.net" -GMX_AT_IMAP_SERVER = "imap.gmx.at" -FIREMAIL_DE_IMAP_SERVER = "imap.firemail.de" -PISS_MAIL_IMAP_SERVER = "mail.pissmail.com" -INBOX_LV_IMAP_SERVER = "mail.inbox.lv" -WEB_DE_IMAP_SERVER = "imap.web.de" -GAZETA_PL_IMAP_SERVER = "imap.gazeta.pl" + +# ── IMAPClient avec IMAP ID spoofing ───────────────────────────────────────── + +class FingerprintIMAPClient(IMAPClient): + """ + IMAPClient qui envoie automatiquement la commande IMAP ID (RFC 2971) + après chaque login(), pour usurper le fingerprint d'un vrai client mail. + """ + + def login(self, username: str, password: str): + result = super().login(username, password) + send_imap_id(self) + return result + + +# ── Fabrique d'instances IMAPClient ────────────────────────────────────────── def show_folders(imap) -> list: + """Retourne la liste des dossiers IMAP (compatible IMAPClient et imaplib).""" folders = [] - isImapClient = isinstance(imap, IMAPClient) - if not isImapClient: + is_imap_client = isinstance(imap, IMAPClient) + if not is_imap_client: for i in imap.list()[1]: - l = i.decode().split(' "/" ') - if len(l) > 1: - folders.append(l[1]) - if len(folders) == 0: - folders.append('INBOX') - return folders + parts = i.decode().split(' "/" ') + if len(parts) > 1: + folders.append(parts[1]) + if not folders: + folders.append("INBOX") else: - list = imap.list_folders() - for i in list: - name = i[-1] - folders.append(name) - return folders + for info in imap.list_folders(): + folders.append(info[-1]) + return folders -def create_imap(login: str): - # 创建一个IMAP4类实例 - if DOMAIN_163 in login: - imap = IMAPClient(IMAP_SERVER_163, use_uid=True) - elif DOMAIN_YAHOO in login: - imap = IMAPClient(YAHOO_IMAP_SERVER, use_uid=True) - elif DOMAIN_FIREMAIL_DE in login: - imap = IMAPClient(FIREMAIL_DE_IMAP_SERVER, use_uid=True) - elif DOMAIN_GMX in login or DOMAIN_GMX_FR in login or DOMAIN_GMX_US in login or DOMAIN_GMX_CH in login or DOMAIN_GMX_PT in login or DOMAIN_GMX_SG in login: - imap = IMAPClient(GMX_IMAP_SERVER, use_uid=True) - elif DOMAIN_SINA in login: - imap = IMAPClient(IMAP_SERVER_SINA, use_uid=True) - elif DOMAIN_HOTMAIL in login or DOMAIN_OUTLOOK_COM in login: - imap = IMAPClient(HOTMAIL_IMAP_SERVER, use_uid=True) - elif DOMAIN_RAMBLER_RU in login: - imap = IMAPClient(RAMBLER_IMAP_SERVER, use_uid=True) - elif DOMAIN_BTVM_NE_JP in login: - imap = IMAPClient(BTVM_NE_JP_SERVER, use_uid=True) - elif DOMAIN_GMAIL in login: - imap = IMAPClient(GMAIL_IMAP_SERVER, use_uid=True) - elif DOMAIN_ONET in login: - imap = IMAPClient(ONET_IMAP_SERVER, use_uid=True) - elif DOMAIN_TIM_IT in login: - imap = IMAPClient(TIME_IT_SERVER, use_uid=True) - elif DOMAIN_ALICE_IT in login: - imap = IMAPClient(ALICE_IMAP_SERVER, use_uid=True) - elif DOMAIN_MARS_DTI_NE_JP in login: - imap = IMAPClient(MARS_DTI_NE_JP_SERVER, use_uid=True) - elif DOMAIN_AURORA_DTI_NE_JP in login: - imap = IMAPClient(MARS_DTI_NE_JP_SERVER, use_uid=True) - elif DOMAIN_NAVER in login: - imap = IMAPClient(NAVER_SERVER, use_uid=True) - elif DOMAIN_GMX_DE in login or DOMAIN_GMX_NET in login: - imap = IMAPClient(GMX_NET_IMAP_SERVER, use_uid=True) - elif DOMAIN_GMX_AT in login: - imap = IMAPClient(GMX_AT_IMAP_SERVER, use_uid=True) - elif DOMAIN_GAZETA_PL in login: - imap = IMAPClient(GAZETA_PL_IMAP_SERVER, use_uid=True) - elif DOMAIN_INBOX_LV in login: - imap = IMAPClient(INBOX_LV_IMAP_SERVER, use_uid=True) - elif DOMAIN_WEB_DE in login: - imap = IMAPClient(WEB_DE_IMAP_SERVER, use_uid=True) - elif DOMAIN_PISS_MAIL in login or DOMAIN_CHILD_PIZZA in login or DOMAIN_DMC_CHAT in login or DOMAIN_GENOCIDE_FUN in login or DOMAIN_HATESJE_WS in login or DOMAIN_INCEL_EMAIL in login or DOMAIN_SHITPOSTING_EXPERT in login: - imap = IMAPClient(PISS_MAIL_IMAP_SERVER, use_uid=True) - else: - imap = IMAPClient(AOL_IMAP_SERVER, use_uid=True) - return imap \ No newline at end of file +def create_imap(login: str) -> FingerprintIMAPClient: + """ + Crée et retourne un FingerprintIMAPClient connecté au bon serveur IMAP + pour le domaine de l'adresse email fournie. + + La résolution domaine → serveur est déléguée à get_imap_server() + (défini dans imap_proxy_reader, source de vérité unique). + """ + server = get_imap_server(login) + return FingerprintIMAPClient(server, use_uid=True) diff --git a/mail/mail_reader_all_contacts.py b/mail/mail_reader_all_contacts.py index 5784250..4020b5f 100644 --- a/mail/mail_reader_all_contacts.py +++ b/mail/mail_reader_all_contacts.py @@ -2,39 +2,59 @@ import datetime import email import logging import os +import random import re -from concurrent.futures import ThreadPoolExecutor +import time +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed from email.header import decode_header -from email.message import Message -from typing import Union, List, Optional +from typing import Union, List, Optional, Dict from dotenv import load_dotenv -from imapclient import IMAPClient from db.mongo_manager import MONGO_STORE_MANAGER from excel_reader import read_contacts -from mail.mail_constants import DOMAIN_HOTMAIL, create_imap -from mail.imap_proxy_reader import ProxyIMAPClient, ProxyConfig, get_imap_server +from mail.mail_constants import DOMAIN_HOTMAIL, create_imap, show_folders +from mail.imap_proxy_reader import ( + ProxyIMAPClient, ProxyConfig, get_imap_server, + extract_body, + VALIDATION_URL_SUBJECT_FR, VALIDATION_URL_SUBJECT_EN, + VALIDATION_URL_REGEX, DATE_FORMAT, +) +from imapclient import IMAPClient from models.ReserveResultPojo import ReserveResultPojo from models.mail_pojo import MailPojo, MailAddress # Charger les variables d'environnement depuis .env load_dotenv() -# 定义常量 -VALIDATION_URL_SUBJECT_FR = 'Validation de votre demande de rendez-vous' -VALIDATION_URL_SUBJECT_EN = 'Please confirm your appointment request' -VALIDATION_URL_REGEX = r"https:\/\/rendezvousparis.hermes.com\/client\/register\/[A-Z0-9]+\/validate.code=[A-Z0-9]+" +# ── Constantes locales ──────────────────────────────────────────────────────── +# VALIDATION_URL_SUBJECT_FR, VALIDATION_URL_SUBJECT_EN, VALIDATION_URL_REGEX, +# DATE_FORMAT sont importés depuis imap_proxy_reader (source de vérité unique). + PART_VALIDATION_URL_REGEX = r"client\/register\/[A-Z0-9]+\/validate.code=[A-Z0-9]+" HERMES_EMAIL = "no-reply@hermes.com" EMAIL_ADDRESS_REGEX = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b' -# 日期格式 -DATE_FORMAT = "%d-%b-%Y" - # Timeouts GMX (en secondes) IMAP_SOCKET_TIMEOUT = 300 # timeout socket pour chaque opération IMAP FUTURE_TIMEOUT = 600 # durée max allouée à la lecture d'une boîte mail +# 按域名限制的最大并发线程数(防止单服务商触发风控) +MAX_WORKERS_PER_DOMAIN: Dict[str, int] = { + "gmx": 80, + "aol": 5, + "gmail": 3, + "yahoo": 5, + "outlook": 5, + "hotmail": 5, + "firemail": 5, + "inbox.lv": 5, + "default": 5, +} + +# 两次读取同一邮箱的最短间隔(分钟),避免频繁重复登录 +MAIL_READ_MIN_INTERVAL_MINUTES = 15 + # GMX域名列表(用于判断是否需要使用代理) GMX_DOMAINS = ( "gmx.com", "gmx.net", "gmx.de", "gmx.at", @@ -42,8 +62,9 @@ GMX_DOMAINS = ( ) # 需要通过代理读取的域名列表 -# PROXY_DOMAINS = GMX_DOMAINS + ("inbox.lv",) -PROXY_DOMAINS = GMX_DOMAINS +PROXY_DOMAINS = GMX_DOMAINS + ("yahoo.com",) +# PROXY_DOMAINS = GMX_DOMAINS + ("yahoo.com",) +# PROXY_DOMAINS = GMX_DOMAINS def is_gmx_account(login: str) -> bool: @@ -55,6 +76,18 @@ def is_proxy_account(login: str) -> bool: """判断邮箱是否需要通过代理读取(GMX 或 inbox.lv)""" return any(d in login.lower() for d in PROXY_DOMAINS) + +def get_domain_group(login: str) -> str: + """ + 将邮箱地址映射到域名分组键,用于限流。 + 例如: "user@gmx.net" → "gmx", "user@aol.com" → "aol" + """ + login_lower = login.lower() + for key in MAX_WORKERS_PER_DOMAIN: + if key != "default" and key in login_lower: + return key + return "default" + # 邮箱列表(简化为常量) REDIRECTION_MAILS = "appointment2022@aol.com, chenpeijun@aol.com,hongjiang176@aol.com,ciyuexie@aol.com,rutger.62@aol.com,ciccidaniel@aol.com,armasgoodman@aol.com,wknd.gemerine@aol.com,rafmail1981@aol.com,tonovichivanenaki@aol.com,hetland.ari@aol.com,mateusiversen@aol.com,lacerdaraffaello@aol.com,anasida76@aol.com,liamolinari@aol.com,sen70zib@aol.com,mezeiderrick@aol.com,stanisl49avchic@aol.com,damcvrobaneuron@aol.com,suyzanna_fleona@aol.com,dxealing.dissa@aol.com,hogg.karen@aol.com,obocharovamarina@aol.com,buchholzjohann@aol.com,orn.cecchini@aol.com,percivaltorgersen@aol.com,candalgudrun@aol.com,filimonis.76@aol.com,bengann_100@aol.com,axelhanne@aol.com,tiffanylarochelle@aol.com,nicoleta.r@aol.com,eichenbaum.1963@aol.com,kotensasharev@aol.com,samognat32@aol.com,edem_headshot@aol.com,kozmakuzmich1960@aol.com,damonsvensson@aol.com,anders.riva@aol.com,caiminwei123@gmail.com,yulingguo086@gmail.com,yingxiaolu086@gmail.com,lijiazhen0035@gmail.com,fangp370@gmail.com,huangyayu10086@gmail.com,fuziyuan110@gmail.com,xinyingdu886@gmail.com,yasiaforever.1971@aol.com,lukaszfidalgo@aol.com,zaichi29@aol.com,prostotakitak.1974@aol.com,mo90nroe@aol.com,blonde.87@aol.com,dimidrol.1969@aol.com" @@ -95,39 +128,29 @@ class MailReader: """邮件读取器类""" def __init__(self, login: str, password: str, proxy: Optional[ProxyConfig] = None, - failed_gmx_list: Optional[List[str]] = None): + failed_gmx_list: Optional[List[str]] = None, + delay_range: tuple = (1.0, 5.0)): self.login = login self.password = password self.proxy = proxy self.failed_gmx_list = failed_gmx_list if failed_gmx_list is not None else [] - - @staticmethod - def show_folders(imap) -> List[str]: - """获取邮箱文件夹列表""" - folders = [] - is_imap_client = isinstance(imap, IMAPClient) - - if not is_imap_client: - # 处理非IMAPClient对象 - for i in imap.list()[1]: - l = i.decode().split(' "/" ') - folders.append(l[1]) - else: - # 处理IMAPClient对象 - folder_list = imap.list_folders() - for i in folder_list: - name = i[-1] - folders.append(name) - - return folders + self.delay_range = delay_range # (min_seconds, max_seconds) 随机延迟范围 def read_emails(self, mails_messages: List[MailPojo]) -> List[MailPojo]: - """读取邮件""" + """读取邮件(含随机延迟和读取时间记录)""" + # 随机延迟,模拟人工节奏,降低被识别为机器人的概率 + _delay = random.uniform(*self.delay_range) + time.sleep(_delay) + # ── GMX / inbox.lv 账户 → 使用代理连接(失败自动重试最多3次)── if is_proxy_account(self.login) and self.proxy is not None: - return self._read_emails_with_proxy_retry(mails_messages) + result = self._read_emails_with_proxy_retry(mails_messages) else: - return self._read_emails_internal(create_imap(self.login), mails_messages) + result = self._read_emails_internal(create_imap(self.login), mails_messages) + + # 记录本次读取时间,供下次调用的 need_to_check_email 判断间隔 + MONGO_STORE_MANAGER.update_mail_read_time(self.login) + return result def _read_emails_with_proxy_retry( self, @@ -175,8 +198,8 @@ class MailReader: mail_list = [] print("read mails from {}".format(self.login)) - # 获取文件夹列表 - folder_list = self.show_folders(imap) + # 获取文件夹列表(委托给 mail_constants.show_folders) + folder_list = show_folders(imap) # 处理每个文件夹 for folder in folder_list: @@ -204,88 +227,72 @@ class MailReader: return mail_list def _get_messages_from_folder(self, imap, subject: str, folder: str = "INBOX") -> List[MailPojo]: - """从指定文件夹获取邮件(传统IMAP方式)""" + """从指定文件夹获取邮件(传统IMAP方式,批量fetch减少往返次数)""" imap.select(folder) mail_messages = [] - # 搜索邮件 + # 搜索符合条件的所有邮件ID search_query = '(SUBJECT "{}" SINCE "{}")'.format(subject, datetime.datetime.today().strftime(DATE_FORMAT)) typ, data = imap.search(None, search_query) - for i in data[0].split(): + ids = data[0].split() + if not ids: + return mail_messages + + # 批量fetch:一次请求取回所有匹配邮件,减少 N 次往返为 1 次 + id_list = b",".join(ids) + try: + res, msg_list = imap.fetch(id_list, "(RFC822)") + except Exception as error: + print("Batch fetch error in folder {}: {}".format(folder, error)) + return mail_messages + + for response in msg_list: + if not isinstance(response, tuple): + continue try: - # 获取邮件内容 - res, msg = imap.fetch(i.decode("utf-8"), "(RFC822)") + email_message = email.message_from_bytes(response[1]) - # 解析邮件 - for response in msg: - if isinstance(response, tuple): - email_message = email.message_from_bytes(response[1]) + # 解码主题 + subject_decoded, subject_encoded = decode_header(email_message["Subject"])[0] + if isinstance(subject_decoded, bytes): + subject_decoded = subject_decoded.decode(subject_encoded) - # 解码主题 - subject, subject_encoded = decode_header(email_message["Subject"])[0] - if isinstance(subject, bytes): - subject = subject.decode(subject_encoded) + # 解码发件人地址 + from_address = find_from_mail(decode_header(email_message.get("From"))) - # 解码发件人地址 - from_address = find_from_mail(decode_header(email_message.get("From"))) + # 解码收件人地址 + to_email = find_from_mail(decode_header(email_message.get("To"))) - # 解码收件人地址 - to_email = find_from_mail(decode_header(email_message.get("To"))) + print("Email:", self.login) + print("From:", from_address) + print("To:", to_email) + print("Subject:", subject_decoded) - print("Email:", self.login) - print("From:", from_address) - print("To:", to_email) - print("Subject:", subject) + # 获取邮件正文(委托给 imap_proxy_reader.extract_body) + body = extract_body(email_message) - # 获取邮件正文 - body = self._extract_body(email_message) + # 检查是否是预约验证邮件 + if VALIDATION_URL_SUBJECT_FR in subject_decoded or VALIDATION_URL_SUBJECT_EN in subject_decoded: + mail = MailPojo( + subject=subject_decoded, + body=body, + from_address=from_address + ) - # 检查是否是预约验证邮件 - if VALIDATION_URL_SUBJECT_FR in subject or VALIDATION_URL_SUBJECT_EN in subject: - mail = MailPojo( - subject=subject, - body=body, - from_address=from_address - ) + # 设置收件人地址 + if to_email is None: + mail.to_address = self.login + else: + mail.to_address = to_email - # 设置收件人地址 - if to_email is None: - mail.to_address = self.login - else: - mail.to_address = to_email - - mail.mail_address = self.login - mail_messages.append(mail) + mail.mail_address = self.login + mail_messages.append(mail) except Exception as error: print("Error processing email: {}".format(error)) return mail_messages - def _extract_body(self, email_message: Message) -> str: - """提取邮件正文""" - body = "" - - # 遍历邮件部分 - for part in email_message.walk(): - try: - content_type = part.get_content_type() - - if content_type == "text/html": - # 处理HTML内容 - payload = part.get_payload(decode=True) - if payload: - body += payload.decode("utf-8", errors="ignore") - elif content_type == "text/plain": - # 处理纯文本内容 - payload = part.get_payload() - if payload: - body += payload - except Exception as error: - print("Error extracting body part: {}".format(error)) - - return body - def _get_messages_from_folder_for_imapclient(self, imap, folder: str = "INBOX") -> List[MailPojo]: """从指定文件夹获取邮件(IMAPClient方式)""" mail_messages = [] @@ -308,8 +315,8 @@ class MailReader: email_message = email.message_from_bytes(message_data[b'RFC822']) # 获取发件人和主题 - from_address = email_message.get('FROM') - subject = email_message.get('subject') + from_address = email_message.get('FROM') or "" + subject = email_message.get('subject') or "" # 检查是否是Hermes邮件 hermes_mail_address = "no-reply@hermes.com" @@ -317,8 +324,8 @@ class MailReader: "outlook.com" in from_address or "hotmail" in from_address): - # 提取邮件正文 - body = self._extract_body_for_imapclient(email_message) + # 提取邮件正文(委托给 imap_proxy_reader.extract_body) + body = extract_body(email_message) # 检查是否是预约验证邮件 if (VALIDATION_URL_SUBJECT_FR in subject or @@ -351,24 +358,6 @@ class MailReader: return mail_messages - def _extract_body_for_imapclient(self, email_message: Message) -> str: - """提取IMAPClient邮件正文""" - body = "" - - for part in email_message.walk(): - content_type = part.get_content_type() - - if content_type == "text/html": - payload = part.get_payload(decode=True) - if payload: - body += payload.decode("utf-8", errors="ignore") - elif content_type == "text/plain": - payload = part.get_payload() - if payload: - body += payload - - return body - # 邮件处理相关函数 def find_item_by_url(url: str, successful_items) -> Union[None, ReserveResultPojo]: @@ -405,57 +394,120 @@ def need_to_valid_url(url: str, item: Union[ReserveResultPojo, None]) -> bool: def need_to_check_email(mail: str, successful_items) -> bool: - """判断是否需要检查邮件""" + """ + 判断是否需要检查邮件。 + 两种情况跳过: + 1. 该邮箱已有成功验证记录(原逻辑) + 2. 距上次读取不足 MAIL_READ_MIN_INTERVAL_MINUTES 分钟(防频繁重复登录) + """ print("successful_items size is " + str(len(successful_items))) - # 过滤已验证的项目 + # 原逻辑:已有成功验证则跳过 filtered_items = [item for item in successful_items if item.email == mail] - - # 检查是否有已验证的项目 validated_items = [item for item in filtered_items if item.url_validated is not None and item.url_validated is True] + if len(validated_items) > 0: + return False - return len(validated_items) == 0 + # 新逻辑:距上次读取时间太短则跳过 + last_read = MONGO_STORE_MANAGER.get_last_mail_read_time(mail) + if last_read is not None: + elapsed_minutes = (datetime.datetime.utcnow() - last_read).total_seconds() / 60 + if elapsed_minutes < MAIL_READ_MIN_INTERVAL_MINUTES: + print("[跳过] {} 距上次读取仅 {:.1f} 分钟,未达到最小间隔 {} 分钟".format( + mail, elapsed_minutes, MAIL_READ_MIN_INTERVAL_MINUTES)) + return False + + return True def find_links_to_validate_from_mail_list( mail_list: List[MailAddress], logger, proxy: Optional[ProxyConfig] = None, + proxy_pool: Optional[List[ProxyConfig]] = None, ) -> List[str]: - """从邮件列表中查找需要验证的链接,返回读取失败的GMX账户列表""" + """ + 从邮件列表中查找需要验证的链接,返回读取失败的GMX账户列表。 + + 参数 + ---- + proxy : 单一代理(GMX专用,兼容旧调用方式) + proxy_pool : 代理列表(非GMX账号也会轮换使用;若为空则非GMX走直连) + """ if not mail_list: return [] - # 检查时间前开始检查邮件 contact_to_book_list = MONGO_STORE_MANAGER.get_all_contact_to_book_list() successful_items = MONGO_STORE_MANAGER.get_all_successful_items_for_day() mails_messages = [] failed_gmx: List[str] = [] - # 使用线程池处理邮件 - with ThreadPoolExecutor(max_workers=100) as executor: - futures = [] + # ── 按域名分组,每组使用独立线程池限流 ──────────────────────────── + # domain_group → [(MailAddress, ProxyConfig|None), ...] + grouped: Dict[str, List[tuple]] = defaultdict(list) - for mail in mail_list: - # 检查是否需要读取邮件 - if need_to_check_email(mail.mail, successful_items): - mail_reader = MailReader(mail.mail, mail.password, proxy=proxy, - failed_gmx_list=failed_gmx) - future = executor.submit(mail_reader.read_emails, mails_messages) - futures.append(future) + for idx, mail in enumerate(mail_list): + if not need_to_check_email(mail.mail, successful_items): + continue - # 等待所有任务完成 - for future in futures: - try: - future.result(timeout=FUTURE_TIMEOUT) - except TimeoutError: - print("⏱️ Timeout ({} s) dépassé pour une boîte mail — lecture ignorée.".format(FUTURE_TIMEOUT)) - except Exception as e: - print("Error processing mail: {},login: {}, password: {}".format(e,mail.mail, mail.password)) + # 为账号分配代理 + if is_proxy_account(mail.mail): + # GMX / inbox.lv → 使用专用 GMX 代理 + assigned_proxy = proxy + elif proxy_pool: + # 非GMX + 有代理池 → 按索引轮换分配 + assigned_proxy = proxy_pool[idx % len(proxy_pool)] + else: + # 无代理池 → 直连 + assigned_proxy = None - # ── Résumé des comptes proxy en échec ────────────────────── + group_key = get_domain_group(mail.mail) + grouped[group_key].append((mail, assigned_proxy)) + + # ── 每个域名分组启动独立线程池 ──────────────────────────────────── + # future → mail address,用于进度显示 + future_to_mail: Dict[object, str] = {} + executors = [] + + for group_key, items in grouped.items(): + max_w = MAX_WORKERS_PER_DOMAIN.get(group_key, MAX_WORKERS_PER_DOMAIN["default"]) + executor = ThreadPoolExecutor(max_workers=max_w) + executors.append(executor) + + print("[限流] 域名组 '{}': {} 账号,max_workers={}".format( + group_key, len(items), max_w)) + + for mail, assigned_proxy in items: + mail_reader = MailReader( + mail.mail, + mail.password, + proxy=assigned_proxy, + failed_gmx_list=failed_gmx, + ) + future = executor.submit(mail_reader.read_emails, mails_messages) + future_to_mail[future] = mail.mail + + # ── 等待所有任务完成,然后关闭线程池 ───────────────────────────── + total = len(future_to_mail) + completed = 0 + for future in as_completed(future_to_mail): + mail_addr = future_to_mail[future] + completed += 1 + try: + future.result(timeout=FUTURE_TIMEOUT) + print("[进度] {}/{} {}".format(completed, total, mail_addr)) + except TimeoutError: + print("[进度] {}/{} {} — Timeout ({} s), lecture ignorée.".format( + completed, total, mail_addr, FUTURE_TIMEOUT)) + except Exception as e: + print("[进度] {}/{} {} — Erreur: {}".format(completed, total, mail_addr, e)) + + for executor in executors: + executor.shutdown(wait=False) + + # ── 输出代理账号读取摘要 ────────────────────────────────────────── if failed_gmx: print("\n[Proxy] ⚠️ {} compte(s) non lus (GMX / inbox.lv) :".format(len(failed_gmx))) for addr in failed_gmx: @@ -463,10 +515,9 @@ def find_links_to_validate_from_mail_list( else: print("\n[Proxy] ✅ Tous les comptes GMX / inbox.lv ont été lus avec succès.") - # 刷新成功的项目 + # ── 处理邮件中的验证链接 ────────────────────────────────────────── _refreshed_successful_items = MONGO_STORE_MANAGER.get_all_successful_items_for_day() - # 处理邮件中的链接 for mail in mails_messages: match = re.search(VALIDATION_URL_REGEX, mail.body) if match: @@ -485,7 +536,7 @@ def find_links_to_validate_from_mail_list( url, mail.to_address, model=_model, - _all_contact_list=contact_to_book_list, _used_ip= _used_ip) + _all_contact_list=contact_to_book_list, _used_ip=_used_ip) else: logger.info("do not need to click url --> {}".format(mail.mail_address)) @@ -500,7 +551,8 @@ if __name__ == '__main__': # file_name="~/Desktop/contact_list_2026-04-21_200_yahoo.xlsx") # file_name="~/Desktop/contact_list_yahoo_100_20_04.xlsx") # file_name="~/Desktop/contact_yahoo_5.xlsx") - file_name="~/Desktop/contact_list_2026-04-22.xlsx") + # file_name="~/Desktop/contact_list_2026-04-24_yahoo_50.xlsx") + file_name="~/Desktop/contact_list_2026-04-23.xlsx") # file_name="~/Desktop/contact_list_2026-04-11.xlsx") # file_name="~/Desktop/contact_list_2026-04-17.xlsx") # file_name="~/Desktop/contact_list_inbox_100_14_04.xlsx") @@ -532,7 +584,7 @@ if __name__ == '__main__': _to_add = False if _to_add: filter_mail.append(mail_pojo) - # filter_mail = [MailAddress("minnakan@firemail.de", "Yjn8nQ0sZ")] + # filter_mail = [MailAddress("pishikmamn@gmx.de", "53OBns2jAXE")] # ── Mode de lecture : GMX_ONLY=true → uniquement les comptes GMX ── gmx_only = os.environ.get("GMX_ONLY", "false").strip().lower() == "true" @@ -550,8 +602,29 @@ if __name__ == '__main__': username=os.environ.get("GMX_PROXY_USERNAME"), password=os.environ.get("GMX_PROXY_PASSWORD"), ) + + # 非GMX账号代理池(可配置多个,轮换使用;留空则直连) + # 格式:PROXY_POOL_HOSTS="host1:port1,host2:port2",与 GMX_PROXY 同类型 + _proxy_pool_raw = os.environ.get("PROXY_POOL_HOSTS", "").strip() + non_gmx_proxy_pool: Optional[List[ProxyConfig]] = None + if _proxy_pool_raw: + non_gmx_proxy_pool = [] + for entry in _proxy_pool_raw.split(","): + entry = entry.strip() + if ":" in entry: + _h, _p = entry.rsplit(":", 1) + non_gmx_proxy_pool.append(ProxyConfig( + host=_h, + port=int(_p), + proxy_type=os.environ.get("GMX_PROXY_TYPE", "SOCKS5"), + username=os.environ.get("GMX_PROXY_USERNAME"), + password=os.environ.get("GMX_PROXY_PASSWORD"), + )) + # 处理邮件 - failed = find_links_to_validate_from_mail_list(filter_mail, logger, proxy=gmx_proxy) + failed = find_links_to_validate_from_mail_list( + filter_mail, logger, proxy=gmx_proxy, proxy_pool=non_gmx_proxy_pool + ) # ── Afficher les comptes GMX non lus ───────────────────── if failed: diff --git a/request_sender_test.py b/request_sender_test.py index 5a85f9b..3b1e53e 100644 --- a/request_sender_test.py +++ b/request_sender_test.py @@ -104,7 +104,8 @@ if __name__ == '__main__': # file_list = ['~/Desktop/contact_list_inbox_lv_100.xlsx'] # file_list = ['~/Desktop/contact_list_yahoo_100_20_04.xlsx'] # file_list = ['~/Desktop/contact_list_2026-04-21_200_yahoo.xlsx'] - file_list = ['~/Desktop/contact_list_2026-04-21.xlsx'] + file_list = ['~/Desktop/contact_list_2026-04-23.xlsx'] + # file_list = ['~/Desktop/contact_list_2026-04-24_yahoo_50.xlsx'] # file_list = ['~/Desktop/reste_inbox_lv.xlsx'] # file_list = ['~/Desktop/contact_list_2024-09-02_firemail_de_100.xlsx'] # file_list = ['~/Desktop/contact_list_inbox_100_14_04.xlsx']