Post-process markdown to clean up noisy HTML email output
Strip images, simplify links to just their text, remove very long bare URLs, and collapse excessive blank lines for a cleaner preview pane.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
23f179df24
commit
3e647dbe52
1 changed file with 19 additions and 1 deletion
20
src/inbox.rs
20
src/inbox.rs
|
|
@ -208,7 +208,7 @@ fn extract_plain_text(raw: &[u8]) -> Result<String, String> {
|
|||
// Fall back to text/html converted to markdown
|
||||
if let Some(html) = find_part(&parsed, "text/html") {
|
||||
let md = html2md::rewrite_html(&html, false);
|
||||
return Ok(clean_text(&md));
|
||||
return Ok(clean_markdown(&clean_text(&md)));
|
||||
}
|
||||
// Last resort: top-level body
|
||||
parsed.get_body().map(|s| clean_text(&s)).map_err(|e| e.to_string())
|
||||
|
|
@ -225,6 +225,24 @@ fn clean_text(text: &str) -> String {
|
|||
}
|
||||
}
|
||||
|
||||
/// Clean up markdown converted from HTML emails:
|
||||
/// - Strip image references 
|
||||
/// - Simplify links [text](url) → text
|
||||
/// - Remove bare long URLs
|
||||
/// - Collapse runs of 3+ blank lines to 2
|
||||
fn clean_markdown(text: &str) -> String {
|
||||
let re_img = regex::Regex::new(r"!\[[^\]]*\]\([^)]*\)").unwrap();
|
||||
let re_link = regex::Regex::new(r"\[([^\]]*)\]\([^)]*\)").unwrap();
|
||||
let re_bare_url = regex::Regex::new(r"https?://\S{80,}").unwrap();
|
||||
let re_blank_lines = regex::Regex::new(r"\n{3,}").unwrap();
|
||||
|
||||
let result = re_img.replace_all(text, "");
|
||||
let result = re_link.replace_all(&result, "$1");
|
||||
let result = re_bare_url.replace_all(&result, "");
|
||||
let result = re_blank_lines.replace_all(&result, "\n\n");
|
||||
result.trim().to_string()
|
||||
}
|
||||
|
||||
fn find_part(mail: &mailparse::ParsedMail, mime_type: &str) -> Option<String> {
|
||||
let content_type = mail.ctype.mimetype.to_lowercase();
|
||||
if content_type == mime_type {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue