diff --git a/src/mail/imap_proxy_reader.py b/src/mail/imap_proxy_reader.py index 12404da..dd567e1 100644 --- a/src/mail/imap_proxy_reader.py +++ b/src/mail/imap_proxy_reader.py @@ -15,6 +15,7 @@ Dépendances : import datetime import email +import hashlib import imaplib import io import logging @@ -23,6 +24,7 @@ import re import socket import ssl import sys +import time from dataclasses import dataclass from email.message import Message from typing import List, Optional, Tuple @@ -37,61 +39,53 @@ load_dotenv() # Constantes # ────────────────────────────────────────────────────────────── -VALIDATION_URL_SUBJECT_FR = "Validation de votre demande de rendez-vous" -VALIDATION_URL_SUBJECT_EN = "Please confirm your appointment request" -VALIDATION_URL_REGEX = ( - r"https:\/\/rendezvousparis\.hermes\.com" - r"\/client\/register\/[A-Z0-9]+\/validate\.code=[A-Z0-9]+" -) - DATE_FORMAT = "%d-%b-%Y" # Correspondance domaine → serveur IMAP (identique à mail_constants.py) IMAP_SERVER_MAP: List[Tuple[str, str]] = [ - ("163.com", "imap.163.com"), - ("yahoo.com", "imap.mail.yahoo.com"), - ("firemail.de", "imap.firemail.de"), - ("gmail.com", "imap.gmail.com"), - ("sina.com", "imap.sina.com"), - ("hotmail.com", "outlook.office365.com"), - ("outlook.com", "outlook.office365.com"), - ("rambler.ru", "imap.rambler.ru"), - ("btvm.ne.jp", "imap.btvm.ne.jp"), - ("mars.dti.ne.jp", "imap.cm.dream.jp"), - ("aurora.dti.ne.jp", "imap.cm.dream.jp"), - ("naver.com", "imap.naver.com"), - ("onet.pl", "imap.poczta.onet.pl"), - ("gazeta.pl", "imap.gazeta.pl"), - ("tim.it", "imap.tim.it"), - ("alice.it", "in.alice.it"), - ("gmx.com", "imap.gmx.com"), - ("gmx.fr", "imap.gmx.com"), - ("gmx.us", "imap.gmx.com"), - ("gmx.ch", "imap.gmx.com"), - ("gmx.pt", "imap.gmx.com"), - ("gmx.sg", "imap.gmx.com"), - ("gmx.net", "imap.gmx.net"), - ("gmx.de", "imap.gmx.net"), - ("gmx.at", "imap.gmx.at"), - ("web.de", "imap.web.de"), - ("inbox.lv", "mail.inbox.lv"), - ("pissmail.com", "mail.pissmail.com"), - ("incel.email", 
"mail.pissmail.com"), - ("shitposting.expert","mail.pissmail.com"), - ("hatesje.ws", "mail.pissmail.com"), - ("child.pizza", "mail.pissmail.com"), - ("genocide.fun", "mail.pissmail.com"), - ("dmc.chat", "mail.pissmail.com"), - ("aol.com", "imap.aol.com"), # fallback AOL + ("163.com", "imap.163.com"), + ("yahoo.com", "imap.mail.yahoo.com"), + ("firemail.de", "imap.firemail.de"), + ("gmail.com", "imap.gmail.com"), + ("sina.com", "imap.sina.com"), + ("hotmail.com", "outlook.office365.com"), + ("outlook.com", "outlook.office365.com"), + ("rambler.ru", "imap.rambler.ru"), + ("btvm.ne.jp", "imap.btvm.ne.jp"), + ("mars.dti.ne.jp", "imap.cm.dream.jp"), + ("aurora.dti.ne.jp", "imap.cm.dream.jp"), + ("naver.com", "imap.naver.com"), + ("onet.pl", "imap.poczta.onet.pl"), + ("gazeta.pl", "imap.gazeta.pl"), + ("tim.it", "imap.tim.it"), + ("alice.it", "in.alice.it"), + ("gmx.com", "imap.gmx.com"), + ("gmx.fr", "imap.gmx.com"), + ("gmx.us", "imap.gmx.com"), + ("gmx.ch", "imap.gmx.com"), + ("gmx.pt", "imap.gmx.com"), + ("gmx.sg", "imap.gmx.com"), + ("gmx.net", "imap.gmx.net"), + ("gmx.de", "imap.gmx.net"), + ("gmx.at", "imap.gmx.at"), + ("web.de", "imap.web.de"), + ("inbox.lv", "mail.inbox.lv"), + ("pissmail.com", "mail.pissmail.com"), + ("incel.email", "mail.pissmail.com"), + ("shitposting.expert", "mail.pissmail.com"), + ("hatesje.ws", "mail.pissmail.com"), + ("child.pizza", "mail.pissmail.com"), + ("genocide.fun", "mail.pissmail.com"), + ("dmc.chat", "mail.pissmail.com"), + ("aol.com", "imap.aol.com"), # fallback AOL ] PROXY_TYPE_MAP = { "SOCKS5": socks.SOCKS5, "SOCKS4": socks.SOCKS4, - "HTTP": socks.HTTP, + "HTTP": socks.HTTP, } - logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) logger.addHandler(logging.StreamHandler(stream=sys.stdout)) @@ -106,7 +100,7 @@ class ProxyConfig: """Configuration du proxy.""" host: str port: int - proxy_type: str = "SOCKS5" # "SOCKS5" | "SOCKS4" | "HTTP" + proxy_type: str = "SOCKS5" # "SOCKS5" | "SOCKS4" | "HTTP" username: 
Optional[str] = None password: Optional[str] = None @@ -138,6 +132,8 @@ class MailResult: from_address: str to_address: str body: str + message_id: str = "" # Header Message-ID + validation_url: str = "" # Première URL Hermes trouvée dans le corps # ────────────────────────────────────────────────────────────── @@ -151,12 +147,12 @@ class ProxyIMAP4_TLS(imaplib.IMAP4): """ def __init__( - self, - host: str, - port: int, - ssl_context: Optional[ssl.SSLContext], - proxy: ProxyConfig, - timeout: Optional[float] = None, + self, + host: str, + port: int, + ssl_context: Optional[ssl.SSLContext], + proxy: ProxyConfig, + timeout: Optional[float] = None, ): self._ssl_context = ssl_context self._proxy = proxy @@ -228,11 +224,11 @@ class ProxyIMAPClient(IMAPClient): """ def __init__( - self, - host: str, - proxy: ProxyConfig, - subjects: Optional[List[str]] = None, - **kwargs, + self, + host: str, + proxy: ProxyConfig, + subjects: Optional[List[str]] = None, + **kwargs, ): self._proxy = proxy # Sujets à rechercher, injectables depuis l'extérieur @@ -256,35 +252,11 @@ class ProxyIMAPClient(IMAPClient): ) def search_by_subjects( - self, - since: Optional[datetime.datetime] = None, - extra_criteria: Optional[List] = None, + self, + since: Optional[datetime.datetime] = None, + extra_criteria: Optional[List] = None, ) -> List[int]: - """ - Recherche les UIDs des emails dont le sujet correspond à l'un - des sujets stockés dans ``self.subjects``. - - Si ``self.subjects`` est vide, retourne tous les messages - depuis ``since`` (sans filtre par sujet). - - Paramètres - ---------- - since : datetime, optional - Filtre SINCE (aujourd'hui par défaut). - extra_criteria : list, optional - Critères IMAP supplémentaires à combiner (AND implicite). - - Retourne - -------- - list[int] — UIDs correspondants (peut être vide). 
def _dedup_key(result: "MailResult") -> tuple:
    """
    Build the de-duplication key used to recognise the same email twice.

    The key is a ``(kind, value)`` tuple; the first non-empty source wins:

    1. ``("url", validation_url)`` when ``result.validation_url`` is set.
    2. ``("msgid", message_id)`` when ``result.message_id`` is set
       (Message-ID is meant to be unique per message, RFC 5322).
    3. ``("body", md5_hex)`` otherwise: MD5 of the body after collapsing
       every run of whitespace to a single space, so that copies of one
       message that differ only in line wrapping or spacing still match.

    The ``kind`` tag keeps values from different sources from colliding
    (a Message-ID can never equal a body hash key).

    Parameters
    ----------
    result : MailResult
        Only ``validation_url``, ``message_id`` and ``body`` are read.

    Returns
    -------
    tuple
        A hashable ``(kind, value)`` pair suitable for a ``set``.
    """
    # 1. Validation URL: strongest identifier when it has been extracted.
    validation_url = (result.validation_url or "").strip()
    if validation_url:
        return ("url", validation_url)

    # 2. Message-ID header: identical for the same email seen in two folders.
    message_id = (result.message_id or "").strip()
    if message_id:
        return ("msgid", message_id)

    # 3. Content fallback. MD5 is used purely as a fingerprint here, not for
    #    security. Whitespace is normalised before hashing; a missing body
    #    is treated as empty instead of raising.
    normalized = re.sub(r"\s+", " ", result.body or "").strip()
    body_hash = hashlib.md5(normalized.encode("utf-8", errors="ignore")).hexdigest()
    return ("body", body_hash)
def _connect(self) -> ProxyIMAPClient:
    """
    Open an authenticated IMAP session through the proxy, with retries.

    Each attempt creates a fresh ``ProxyIMAPClient`` and logs in with the
    account credentials. After a failed attempt the half-open client (if
    any) is closed, then the method sleeps with exponential backoff before
    trying again: ``retry_delay * 2 ** (attempt - 1)`` seconds, i.e.
    ``retry_delay``, ``2 * retry_delay``, ``4 * retry_delay``, ...

    Returns
    -------
    ProxyIMAPClient
        A connected and logged-in client.

    Raises
    ------
    ConnectionError
        When every attempt failed; the last underlying exception is
        chained as ``__cause__``.
    """
    imap_server = get_imap_server(self.account.login)
    # A non-positive max_retries still means "try once".
    attempts = max(1, self.max_retries)
    last_exc: Optional[Exception] = None

    for attempt in range(1, attempts + 1):
        logger.info(
            "[%s] Tentative %d/%d — Connexion via %s → %s:993",
            self.account.login, attempt, attempts,
            self.proxy, imap_server,
        )
        client: Optional[ProxyIMAPClient] = None
        try:
            client = ProxyIMAPClient(
                host=imap_server,
                proxy=self.proxy,
                subjects=self._subjects,  # forwarded to the low-level client
                use_uid=True,
                ssl=True,
                timeout=self.timeout,
            )
            client.login(self.account.login, self.account.password)
        except Exception as exc:
            # Deliberately broad: proxy, TLS, socket and login failures are
            # all retried the same way.
            last_exc = exc
            logger.warning(
                "[%s] Échec connexion/login (tentative %d/%d) : %s",
                self.account.login, attempt, attempts, exc,
            )
            # The socket may be open even though login failed; close it so
            # retries do not pile up leaked proxied connections.
            if client is not None:
                try:
                    client.shutdown()
                except Exception as close_exc:
                    logger.debug(
                        "[%s] Fermeture du client en échec impossible : %s",
                        self.account.login, close_exc,
                    )
            if attempt < attempts:
                # Exponential backoff (was `2 * (attempt - 1)`, which gave
                # 0 s before the first retry and grew only linearly).
                delay = self.retry_delay * (2 ** (attempt - 1))
                logger.info(
                    "[%s] Nouvelle tentative dans %.1f s…",
                    self.account.login, delay,
                )
                time.sleep(delay)
        else:
            logger.info(
                "[%s] Connecté (tentative %d). Sujets recherchés : %s",
                self.account.login, attempt, self._subjects,
            )
            return client

    raise ConnectionError(
        f"[{self.account.login}] Impossible de se connecter après "
        f"{attempts} tentative(s). Dernière erreur : {last_exc}"
    ) from last_exc
Optional[List[str]] = None, + self, + since: Optional[datetime.datetime] = None, + skip_folders: Optional[List[str]] = None, ) -> List[MailResult]: """ Se connecte au serveur IMAP via le proxy et retourne la liste @@ -498,12 +530,19 @@ class ProxyMailReader: ---------- since : datetime, optional — date de début de recherche skip_folders : list[str], optional — dossiers à ignorer - (défaut : ["Sent", "Drafts", "Trash", "Junk", "Spam"]) + (défaut : ["Sent", "Drafts", "Trash", "Junk", "Spam", + "[Gmail]/All Mail", "[Gmail]/Starred", + "[Gmail]/Important"]) """ if skip_folders is None: - skip_folders = ["Sent", "Drafts", "Trash", "Junk", "Spam"] + skip_folders = [ + "Sent", "Drafts", "Trash", "Junk", "Spam", + # Dossiers Gmail qui dupliquent le contenu d'INBOX + "[Gmail]/All Mail", "[Gmail]/Starred", "[Gmail]/Important", + ] all_results: List[MailResult] = [] + seen_message_ids: set = set() # déduplication inter-dossiers client = self._connect() try: @@ -515,7 +554,18 @@ class ProxyMailReader: logger.debug("[%s] Dossier ignoré : %s", self.account.login, folder) continue - all_results.extend(self._read_folder(client, folder, since)) + + for result in self._read_folder(client, folder, since): + dedup_key = _dedup_key(result) + if dedup_key in seen_message_ids: + logger.debug( + "[%s] Doublon ignoré (clé=%s) dans '%s'", + self.account.login, str(dedup_key)[:40], folder, + ) + continue + + seen_message_ids.add(dedup_key) + all_results.append(result) finally: try: client.logout() @@ -533,11 +583,11 @@ from concurrent.futures import ThreadPoolExecutor, as_completed def read_multiple_accounts( - accounts: List[MailAccount], - proxy: ProxyConfig, - since: Optional[datetime.datetime] = None, - max_workers: int = 10, - timeout: float = 30.0, + accounts: List[MailAccount], + proxy: ProxyConfig, + since: Optional[datetime.datetime] = None, + max_workers: int = 10, + timeout: float = 30.0, ) -> List[MailResult]: """ Lit plusieurs comptes email en parallèle via le même proxy. 
@@ -588,7 +638,7 @@ if __name__ == "__main__": # ── 2. Définir les comptes à lire ──────────────────────── accounts = [ - MailAccount(login="birgitnaya@gmx.net", password="XEeUF3Y1yaO"), + MailAccount(login="birgitnaya@gmx.net", password="XEeUF3Y1yaO"), # MailAccount(login="user@gmail.com", password="apppassword"), # MailAccount(login="user@outlook.com", password="password"), ] @@ -603,16 +653,13 @@ if __name__ == "__main__": ) # ── 4. Afficher les résultats ──────────────────────────── - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f" {len(results)} email(s) de validation trouvé(s)") - print(f"{'='*60}\n") + print(f"{'=' * 60}\n") for r in results: print(f" Compte : {r.account}") print(f" De : {r.from_address}") print(f" Sujet : {r.subject}") print(f" URLs : {r.validation_urls or 'aucune'}") - print(f" {'-'*56}") - - - + print(f" {'-' * 56}")