diff --git a/src/inbox.rs b/src/inbox.rs index 96148e7..a80df32 100644 --- a/src/inbox.rs +++ b/src/inbox.rs @@ -208,7 +208,7 @@ fn extract_plain_text(raw: &[u8]) -> Result { // Fall back to text/html converted to markdown if let Some(html) = find_part(&parsed, "text/html") { let md = html2md::rewrite_html(&html, false); - return Ok(clean_text(&md)); + return Ok(clean_markdown(&clean_text(&md))); } // Last resort: top-level body parsed.get_body().map(|s| clean_text(&s)).map_err(|e| e.to_string()) @@ -225,6 +225,24 @@ fn clean_text(text: &str) -> String { } } +/// Clean up markdown converted from HTML emails: +/// - Strip image references ![...](...) +/// - Simplify links [text](url) → text +/// - Remove bare long URLs +/// - Collapse runs of 3+ blank lines to 2 +fn clean_markdown(text: &str) -> String { + let re_img = regex::Regex::new(r"!\[[^\]]*\]\([^)]*\)").unwrap(); + let re_link = regex::Regex::new(r"\[([^\]]*)\]\([^)]*\)").unwrap(); + let re_bare_url = regex::Regex::new(r"https?://\S{80,}").unwrap(); + let re_blank_lines = regex::Regex::new(r"\n{3,}").unwrap(); + + let result = re_img.replace_all(text, ""); + let result = re_link.replace_all(&result, "$1"); + let result = re_bare_url.replace_all(&result, ""); + let result = re_blank_lines.replace_all(&result, "\n\n"); + result.trim().to_string() +} + fn find_part(mail: &mailparse::ParsedMail, mime_type: &str) -> Option { let content_type = mail.ctype.mimetype.to_lowercase(); if content_type == mime_type {