Need to change how the mail body is read

This commit is contained in:
2026-04-02 23:10:25 +02:00
parent e2c6483911
commit 40d479b2fc
+183 -136
View File
@@ -15,6 +15,7 @@ Dépendances :
import datetime import datetime
import email import email
import hashlib
import imaplib import imaplib
import io import io
import logging import logging
@@ -23,6 +24,7 @@ import re
import socket import socket
import ssl import ssl
import sys import sys
import time
from dataclasses import dataclass from dataclasses import dataclass
from email.message import Message from email.message import Message
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
@@ -37,61 +39,53 @@ load_dotenv()
# Constantes # Constantes
# ────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────
VALIDATION_URL_SUBJECT_FR = "Validation de votre demande de rendez-vous"
VALIDATION_URL_SUBJECT_EN = "Please confirm your appointment request"
VALIDATION_URL_REGEX = (
r"https:\/\/rendezvousparis\.hermes\.com"
r"\/client\/register\/[A-Z0-9]+\/validate\.code=[A-Z0-9]+"
)
DATE_FORMAT = "%d-%b-%Y" DATE_FORMAT = "%d-%b-%Y"
# Correspondance domaine → serveur IMAP (identique à mail_constants.py) # Correspondance domaine → serveur IMAP (identique à mail_constants.py)
IMAP_SERVER_MAP: List[Tuple[str, str]] = [ IMAP_SERVER_MAP: List[Tuple[str, str]] = [
("163.com", "imap.163.com"), ("163.com", "imap.163.com"),
("yahoo.com", "imap.mail.yahoo.com"), ("yahoo.com", "imap.mail.yahoo.com"),
("firemail.de", "imap.firemail.de"), ("firemail.de", "imap.firemail.de"),
("gmail.com", "imap.gmail.com"), ("gmail.com", "imap.gmail.com"),
("sina.com", "imap.sina.com"), ("sina.com", "imap.sina.com"),
("hotmail.com", "outlook.office365.com"), ("hotmail.com", "outlook.office365.com"),
("outlook.com", "outlook.office365.com"), ("outlook.com", "outlook.office365.com"),
("rambler.ru", "imap.rambler.ru"), ("rambler.ru", "imap.rambler.ru"),
("btvm.ne.jp", "imap.btvm.ne.jp"), ("btvm.ne.jp", "imap.btvm.ne.jp"),
("mars.dti.ne.jp", "imap.cm.dream.jp"), ("mars.dti.ne.jp", "imap.cm.dream.jp"),
("aurora.dti.ne.jp", "imap.cm.dream.jp"), ("aurora.dti.ne.jp", "imap.cm.dream.jp"),
("naver.com", "imap.naver.com"), ("naver.com", "imap.naver.com"),
("onet.pl", "imap.poczta.onet.pl"), ("onet.pl", "imap.poczta.onet.pl"),
("gazeta.pl", "imap.gazeta.pl"), ("gazeta.pl", "imap.gazeta.pl"),
("tim.it", "imap.tim.it"), ("tim.it", "imap.tim.it"),
("alice.it", "in.alice.it"), ("alice.it", "in.alice.it"),
("gmx.com", "imap.gmx.com"), ("gmx.com", "imap.gmx.com"),
("gmx.fr", "imap.gmx.com"), ("gmx.fr", "imap.gmx.com"),
("gmx.us", "imap.gmx.com"), ("gmx.us", "imap.gmx.com"),
("gmx.ch", "imap.gmx.com"), ("gmx.ch", "imap.gmx.com"),
("gmx.pt", "imap.gmx.com"), ("gmx.pt", "imap.gmx.com"),
("gmx.sg", "imap.gmx.com"), ("gmx.sg", "imap.gmx.com"),
("gmx.net", "imap.gmx.net"), ("gmx.net", "imap.gmx.net"),
("gmx.de", "imap.gmx.net"), ("gmx.de", "imap.gmx.net"),
("gmx.at", "imap.gmx.at"), ("gmx.at", "imap.gmx.at"),
("web.de", "imap.web.de"), ("web.de", "imap.web.de"),
("inbox.lv", "mail.inbox.lv"), ("inbox.lv", "mail.inbox.lv"),
("pissmail.com", "mail.pissmail.com"), ("pissmail.com", "mail.pissmail.com"),
("incel.email", "mail.pissmail.com"), ("incel.email", "mail.pissmail.com"),
("shitposting.expert","mail.pissmail.com"), ("shitposting.expert", "mail.pissmail.com"),
("hatesje.ws", "mail.pissmail.com"), ("hatesje.ws", "mail.pissmail.com"),
("child.pizza", "mail.pissmail.com"), ("child.pizza", "mail.pissmail.com"),
("genocide.fun", "mail.pissmail.com"), ("genocide.fun", "mail.pissmail.com"),
("dmc.chat", "mail.pissmail.com"), ("dmc.chat", "mail.pissmail.com"),
("aol.com", "imap.aol.com"), # fallback AOL ("aol.com", "imap.aol.com"), # fallback AOL
] ]
PROXY_TYPE_MAP = { PROXY_TYPE_MAP = {
"SOCKS5": socks.SOCKS5, "SOCKS5": socks.SOCKS5,
"SOCKS4": socks.SOCKS4, "SOCKS4": socks.SOCKS4,
"HTTP": socks.HTTP, "HTTP": socks.HTTP,
} }
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(stream=sys.stdout)) logger.addHandler(logging.StreamHandler(stream=sys.stdout))
@@ -106,7 +100,7 @@ class ProxyConfig:
"""Configuration du proxy.""" """Configuration du proxy."""
host: str host: str
port: int port: int
proxy_type: str = "SOCKS5" # "SOCKS5" | "SOCKS4" | "HTTP" proxy_type: str = "SOCKS5" # "SOCKS5" | "SOCKS4" | "HTTP"
username: Optional[str] = None username: Optional[str] = None
password: Optional[str] = None password: Optional[str] = None
@@ -138,6 +132,8 @@ class MailResult:
from_address: str from_address: str
to_address: str to_address: str
body: str body: str
message_id: str = "" # Header Message-ID
validation_url: str = "" # Première URL Hermes trouvée dans le corps
# ────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────
@@ -151,12 +147,12 @@ class ProxyIMAP4_TLS(imaplib.IMAP4):
""" """
def __init__( def __init__(
self, self,
host: str, host: str,
port: int, port: int,
ssl_context: Optional[ssl.SSLContext], ssl_context: Optional[ssl.SSLContext],
proxy: ProxyConfig, proxy: ProxyConfig,
timeout: Optional[float] = None, timeout: Optional[float] = None,
): ):
self._ssl_context = ssl_context self._ssl_context = ssl_context
self._proxy = proxy self._proxy = proxy
@@ -228,11 +224,11 @@ class ProxyIMAPClient(IMAPClient):
""" """
def __init__( def __init__(
self, self,
host: str, host: str,
proxy: ProxyConfig, proxy: ProxyConfig,
subjects: Optional[List[str]] = None, subjects: Optional[List[str]] = None,
**kwargs, **kwargs,
): ):
self._proxy = proxy self._proxy = proxy
# Sujets à rechercher, injectables depuis l'extérieur # Sujets à rechercher, injectables depuis l'extérieur
@@ -256,35 +252,11 @@ class ProxyIMAPClient(IMAPClient):
) )
def search_by_subjects( def search_by_subjects(
self, self,
since: Optional[datetime.datetime] = None, since: Optional[datetime.datetime] = None,
extra_criteria: Optional[List] = None, extra_criteria: Optional[List] = None,
) -> List[int]: ) -> List[int]:
""" base: List = ["SINCE", datetime.datetime.today()]
Recherche les UIDs des emails dont le sujet correspond à l'un
des sujets stockés dans ``self.subjects``.
Si ``self.subjects`` est vide, retourne tous les messages
depuis ``since`` (sans filtre par sujet).
Paramètres
----------
since : datetime, optional
Filtre SINCE (aujourd'hui par défaut).
extra_criteria : list, optional
Critères IMAP supplémentaires à combiner (AND implicite).
Retourne
--------
list[int] — UIDs correspondants (peut être vide).
Exemple
-------
client.subjects = ["Confirmation RDV", "confirmed"]
uids = client.search_by_subjects(since=datetime.datetime.today())
"""
since = since or datetime.datetime.today()
base: List = ["SINCE", since]
if extra_criteria: if extra_criteria:
base.extend(extra_criteria) base.extend(extra_criteria)
@@ -336,9 +308,27 @@ def extract_body(email_message: Message) -> str:
return body return body
def _dedup_key(result: "MailResult") -> Tuple[str, str]:
    """
    Build the de-duplication key for a ``MailResult``.

    The key is a ``(kind, value)`` pair; the ``kind`` tag keeps values
    taken from different sources from ever colliding with each other.
    Sources are tried in decreasing order of reliability:

    1. ``("url", ...)``   -- ``result.validation_url`` when it is set.
    2. ``("msgid", ...)`` -- ``result.message_id`` when it is set.
    3. ``("body", ...)``  -- MD5 hex digest of the body, used only when
       both fields above are empty.

    Before hashing, every run of whitespace in the body is collapsed to a
    single space and the result is stripped, so that copies of the same
    message that differ only in line endings or spacing share one key.

    Parameters
    ----------
    result : MailResult
        The message to compute a key for. Only ``validation_url``,
        ``message_id`` and ``body`` are read.

    Returns
    -------
    tuple[str, str] -- hashable key usable as a set member.
    """
    # Most specific identifier first: the validation link.
    url = (result.validation_url or "").strip()
    if url:
        return ("url", url)

    # Next best: the Message-ID header, which is the same for every copy
    # of one message regardless of the folder it is listed in.
    message_id = (result.message_id or "").strip()
    if message_id:
        return ("msgid", message_id)

    # Last resort: content hash. MD5 is used purely as a fingerprint here,
    # not for any security purpose.
    normalized = re.sub(r"\s+", " ", result.body or "").strip()
    body_hash = hashlib.md5(normalized.encode("utf-8", errors="ignore")).hexdigest()
    return ("body", body_hash)
# ────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────
@@ -368,16 +358,20 @@ class ProxyMailReader:
""" """
def __init__( def __init__(
self, self,
account: MailAccount, account: MailAccount,
proxy: ProxyConfig, proxy: ProxyConfig,
timeout: float = 30.0, timeout: float = 30.0,
subjects: Optional[List[str]] = None, subjects: Optional[List[str]] = None,
from_addresses: Optional[List[str]] = None, from_addresses: Optional[List[str]] = None,
max_retries: int = 8,
retry_delay: float = 2.0,
): ):
self.account = account self.account = account
self.proxy = proxy self.proxy = proxy
self.timeout = timeout self.timeout = timeout
self.max_retries = max_retries
self.retry_delay = retry_delay
self._subjects = [] self._subjects = []
if subjects: if subjects:
self._subjects.extend(subjects) self._subjects.extend(subjects)
@@ -390,21 +384,47 @@ class ProxyMailReader:
def _connect(self) -> "ProxyIMAPClient":
    """
    Open a logged-in IMAP session through the proxy, retrying on failure.

    At most ``self.max_retries`` attempts are made. Any exception raised
    while building the client or while logging in counts as a failed
    attempt. Between two attempts the method sleeps with an exponential
    back-off: ``retry_delay``, ``2 * retry_delay``, ``4 * retry_delay``, ...

    Returns
    -------
    ProxyIMAPClient -- client on which ``login`` has succeeded.

    Raises
    ------
    ConnectionError
        When every attempt failed; chained to the last underlying error.
    """
    imap_server = get_imap_server(self.account.login)
    last_exc: Optional[Exception] = None

    for attempt in range(1, self.max_retries + 1):
        logger.info(
            "[%s] Tentative %d/%d — Connexion via %s%s:993",
            self.account.login, attempt, self.max_retries,
            self.proxy, imap_server,
        )
        client = None
        try:
            client = ProxyIMAPClient(
                host=imap_server,
                proxy=self.proxy,
                subjects=self._subjects,
                use_uid=True,
                ssl=True,
                timeout=self.timeout,
            )
            client.login(self.account.login, self.account.password)
        except Exception as exc:
            last_exc = exc
            logger.warning(
                "[%s] Échec connexion/login (tentative %d/%d) : %s",
                self.account.login, attempt, self.max_retries, exc,
            )
            # The connection may already be open when login() fails:
            # close it (best effort) so failed attempts do not pile up
            # open proxied sockets.
            if client is not None:
                try:
                    client.shutdown()
                except Exception as close_exc:
                    logger.debug(
                        "[%s] Fermeture après échec impossible : %s",
                        self.account.login, close_exc,
                    )
        else:
            logger.info(
                "[%s] Connecté (tentative %d). Sujets recherchés : %s",
                self.account.login, attempt, self._subjects,
            )
            return client

        if attempt < self.max_retries:
            # Exponential back-off. The previous ``2 * (attempt - 1)``
            # produced a zero delay after the first failure.
            delay = self.retry_delay * (2 ** (attempt - 1))
            logger.info(
                "[%s] Nouvelle tentative dans %.1f s…",
                self.account.login, delay,
            )
            time.sleep(delay)

    raise ConnectionError(
        f"[{self.account.login}] Impossible de se connecter après "
        f"{self.max_retries} tentative(s). Dernière erreur : {last_exc}"
    ) from last_exc
# ── Lecture des dossiers ───────────────────────────────── # ── Lecture des dossiers ─────────────────────────────────
@@ -414,13 +434,15 @@ class ProxyMailReader:
# ── Lecture des messages ───────────────────────────────── # ── Lecture des messages ─────────────────────────────────
def _read_folder( def _read_folder(
self, self,
client: ProxyIMAPClient, client: ProxyIMAPClient,
folder: str, folder: str,
since: Optional[datetime.datetime] = None, since: Optional[datetime.datetime] = None,
) -> List[MailResult]: ) -> List[MailResult]:
results: List[MailResult] = [] results: List[MailResult] = []
since = since or datetime.datetime.today() since = since or datetime.datetime.today()
# 用于去重:同一主题+发信人只读第一封
seen_subject_from: set = set()
try: try:
client.select_folder(folder, readonly=True) client.select_folder(folder, readonly=True)
@@ -439,7 +461,7 @@ class ProxyMailReader:
if not uids: if not uids:
return results return results
print("uids {}".format(uids))
logger.info("[%s] %d message(s) dans '%s'", logger.info("[%s] %d message(s) dans '%s'",
self.account.login, len(uids), folder) self.account.login, len(uids), folder)
@@ -453,6 +475,19 @@ class ProxyMailReader:
subject = em.get("Subject", "") subject = em.get("Subject", "")
from_addr = em.get("From", "") from_addr = em.get("From", "")
to_addr = em.get("To", self.account.login) to_addr = em.get("To", self.account.login)
message_id = em.get("Message-ID", "").strip()
print("subject {}".format(subject))
print("message_id {}".format(message_id))
# 去重:同一主题+发信人只读第一封
dedup_key = (subject, from_addr)
if dedup_key in seen_subject_from:
logger.debug(
"[%s] Doublon ignoré (même sujet et expéditeur) dans '%s': %s",
self.account.login, folder, subject[:50]
)
continue
seen_subject_from.add(dedup_key)
# Filtrer : on ne garde que les emails correspondant aux sujets/expéditeurs configurés # Filtrer : on ne garde que les emails correspondant aux sujets/expéditeurs configurés
is_validation = ( is_validation = (
@@ -468,13 +503,10 @@ class ProxyMailReader:
subject=subject, subject=subject,
from_address=from_addr, from_address=from_addr,
to_address=to_addr, to_address=to_addr,
body=body body=body,
message_id=message_id
) )
results.append(result) results.append(result)
logger.info(
"[%s] Email trouvé (uid=%s) — URLs : %s",
self.account.login, uid
)
except Exception as exc: except Exception as exc:
logger.warning( logger.warning(
"[%s] Erreur traitement uid=%s : %s", "[%s] Erreur traitement uid=%s : %s",
@@ -486,9 +518,9 @@ class ProxyMailReader:
# ── Point d'entrée public ──────────────────────────────── # ── Point d'entrée public ────────────────────────────────
def read( def read(
self, self,
since: Optional[datetime.datetime] = None, since: Optional[datetime.datetime] = None,
skip_folders: Optional[List[str]] = None, skip_folders: Optional[List[str]] = None,
) -> List[MailResult]: ) -> List[MailResult]:
""" """
Se connecte au serveur IMAP via le proxy et retourne la liste Se connecte au serveur IMAP via le proxy et retourne la liste
@@ -498,12 +530,19 @@ class ProxyMailReader:
---------- ----------
since : datetime, optional — date de début de recherche since : datetime, optional — date de début de recherche
skip_folders : list[str], optional — dossiers à ignorer skip_folders : list[str], optional — dossiers à ignorer
(défaut : ["Sent", "Drafts", "Trash", "Junk", "Spam"]) (défaut : ["Sent", "Drafts", "Trash", "Junk", "Spam",
"[Gmail]/All Mail", "[Gmail]/Starred",
"[Gmail]/Important"])
""" """
if skip_folders is None: if skip_folders is None:
skip_folders = ["Sent", "Drafts", "Trash", "Junk", "Spam"] skip_folders = [
"Sent", "Drafts", "Trash", "Junk", "Spam",
# Dossiers Gmail qui dupliquent le contenu d'INBOX
"[Gmail]/All Mail", "[Gmail]/Starred", "[Gmail]/Important",
]
all_results: List[MailResult] = [] all_results: List[MailResult] = []
seen_message_ids: set = set() # déduplication inter-dossiers
client = self._connect() client = self._connect()
try: try:
@@ -515,7 +554,18 @@ class ProxyMailReader:
logger.debug("[%s] Dossier ignoré : %s", logger.debug("[%s] Dossier ignoré : %s",
self.account.login, folder) self.account.login, folder)
continue continue
all_results.extend(self._read_folder(client, folder, since))
for result in self._read_folder(client, folder, since):
dedup_key = _dedup_key(result)
if dedup_key in seen_message_ids:
logger.debug(
"[%s] Doublon ignoré (clé=%s) dans '%s'",
self.account.login, str(dedup_key)[:40], folder,
)
continue
seen_message_ids.add(dedup_key)
all_results.append(result)
finally: finally:
try: try:
client.logout() client.logout()
@@ -533,11 +583,11 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
def read_multiple_accounts( def read_multiple_accounts(
accounts: List[MailAccount], accounts: List[MailAccount],
proxy: ProxyConfig, proxy: ProxyConfig,
since: Optional[datetime.datetime] = None, since: Optional[datetime.datetime] = None,
max_workers: int = 10, max_workers: int = 10,
timeout: float = 30.0, timeout: float = 30.0,
) -> List[MailResult]: ) -> List[MailResult]:
""" """
Lit plusieurs comptes email en parallèle via le même proxy. Lit plusieurs comptes email en parallèle via le même proxy.
@@ -588,7 +638,7 @@ if __name__ == "__main__":
# ── 2. Définir les comptes à lire ──────────────────────── # ── 2. Définir les comptes à lire ────────────────────────
accounts = [ accounts = [
MailAccount(login="birgitnaya@gmx.net", password="XEeUF3Y1yaO"), MailAccount(login="birgitnaya@gmx.net", password="XEeUF3Y1yaO"),
# MailAccount(login="user@gmail.com", password="apppassword"), # MailAccount(login="user@gmail.com", password="apppassword"),
# MailAccount(login="user@outlook.com", password="password"), # MailAccount(login="user@outlook.com", password="password"),
] ]
@@ -603,16 +653,13 @@ if __name__ == "__main__":
) )
# ── 4. Afficher les résultats ──────────────────────────── # ── 4. Afficher les résultats ────────────────────────────
print(f"\n{'='*60}") print(f"\n{'=' * 60}")
print(f" {len(results)} email(s) de validation trouvé(s)") print(f" {len(results)} email(s) de validation trouvé(s)")
print(f"{'='*60}\n") print(f"{'=' * 60}\n")
for r in results: for r in results:
print(f" Compte : {r.account}") print(f" Compte : {r.account}")
print(f" De : {r.from_address}") print(f" De : {r.from_address}")
print(f" Sujet : {r.subject}") print(f" Sujet : {r.subject}")
print(f" URLs : {r.validation_urls or 'aucune'}") print(f" URLs : {r.validation_urls or 'aucune'}")
print(f" {'-'*56}") print(f" {'-' * 56}")