#!/usr/bin/perl # Make an email, (e.g., RISKS.org submission) into UTF-8, no matter what # charset it arrived with. Also strips boring headers. # Depends on: apt-get install libemail-mime-perl # Usage: pipe a message into this program. # Of course we are talking about piping in a single raw email message, # with all its original headers. As those headers tell us what charset # it is in, which we need to know in order to be able to convert to # UTF-8. # # Copyright : https://www.fsf.org/copyleft/gpl.html # Author : Dan Jacobson -- jidanni@jidanni.org https://www.jidanni.org/ # Latest Version : https://www.jidanni.org/comp/mail/riskfix # Created On : Sun Jun 13 19:22:14 2021 # Last Modified On: Mon Jan 17 00:53:09 2022 # Update Count : 162 use strict; use warnings FATAL => q(all); use Email::MIME; binmode STDOUT, ":utf8"; binmode STDERR, ":utf8"; my $message; ## my $boggy; while (<>) { # Some users e.g., send utf-8 mail, but somehow their header says something else. # Therefore the Moderator, upon spotting a Mess, needs to edit the # headers by hand, and re-pipe it through this program. # Or we could edit them for him: # if (/^From: .*dannyb\@panix.com/) { $boggy++ } # if ( $boggy && /CHARSET/ ) { # s/iso-8859-1/utf-8/; # } # But this is surely a one-time incident, as article submitters # here, all being computer professionals, can be convinced to # reconfigure their systems. # One would think we can use Encode::Guess to help spot such incidents... # # But like that man page says, "ISO-8859 series is just too successful for # most cases (because it fills almost all code points in \x00-\xff)." # Anyway, e.g., # U+00F1 LATIN SMALL LETTER N WITH TILDE: UTF-8: c3 b1 # is also # $ man ISO_8859-1 # C3 LATIN CAPITAL LETTER A WITH TILDE # B1 PLUS-MINUS SIGN # so we would have to delve into sniffing the likeliness of juxtapositions... $message .= $_; } my $parsed = Email::MIME->new($message); my %p = $parsed->header_str_pairs; for (qw/From Subject Date/) { printf "%s: %s\n", $_, $p{$_}; } print "\n"; my $txt; my $html; $parsed->walk_parts( sub { my ($part) = @_; if ( !$part->content_type ) { die "$0: Hey, there was no content_type etc. header!"; } if ( $part->content_type =~ m[text/html]i ) { $html++; } elsif ( $part->content_type =~ m[text/plain]i ) { $txt = $part->body_str; #converts to Unicode return; } } ); if ( $html && !$txt ) { die "$0: Gosh, the message only has a HTML part and well, we aren't ready for that yet."; ## Perhaps start with /usr/share/doc/libhtml-parser-perl/examples/htext ## Or maybe: ## Package: mailtextbody : tool to return the body of an email message ## Mailtextbody reads a complete email message on stdin and returns the body ## on stdout. Technically speaking, it returns the first decoded text/plain... } $_ = $txt || $parsed->body_str; #converts to Unicode s/^\n+//; #no beginning blank lines s/[ \t\r]+$//gm; #strip trailing blanks etc. s{\n{3,}}{\n\n}g; #compress empty lines s{\n{2,}$}{\n}; #no ending blank lines my $entity_count; entitiy_check($_) unless $entity_count; print; ## (For going further and stripping ## https://example.com/bla?junk=at-the-end-of-URLs, I would recommend ## a separate program...) sub entitiy_check { my $s = $_[0]; $s =~ s/https?:\S+//g; if ( $s =~ /&\w+;/ ) { print STDERR <