package MHTML;
use base Exporter;
our @EXPORT = qw(get_mht);
use strict;
use Carp qw(croak cluck);
use URI;
use File::Basename;
use HTML::LinkExtor;
use LWP::UserAgent;
use MIME::Lite;
my $QUOTED_STR = qr/ \" ([^\"]*) \" | \' ([^\']*) \' /x;
my $URL_RE = qr/ url \s* \(
\s* (?: $QUOTED_STR | (.*?) ) \s*
\)
/ix;
my $IMPORT_RE = qr/
\@import (?:
\s+ $URL_RE |
\s* $QUOTED_STR |
\s+ (\S+)
)
/xi;
sub new {
my ( undef, $url ) = @_;
return get_mht($url);
}
sub get_mht {
my $target = shift or croak 'A target is not provided.';
my ( $base_url, $ua, $res, @files );
if ( ref $target->[0] eq 'LWP::UserAgent'
and ref $target->[1] eq 'HTTP::Response' )
{
$ua = $target->[0];
$res = $target->[1];
$base_url = $res->base();
}
else {
$ua = new LWP::UserAgent;
$base_url = $target;
my $req = HTTP::Request->new( GET => $base_url );
$req->referer($base_url);
$res = $ua->request($req);
$res->is_success or croak $res->status_line;
}
my %url_flag = ( $base_url => undef );
my $mhtml = MIME::Lite->new(
Subject => $res->title(),
Type => 'multipart/related',
);
$mhtml->delete('X-Mailer');
$mhtml->delete('Date');
$mhtml->attr( "Content-Location" => $base_url );
my $store_filedata = sub {
my ( $tag, %attr ) = @_;
if ( $tag eq 'img' or $tag eq 'link' or $tag eq 'script' ) {
foreach my $src ( values %attr ) {
push( @files, [ $tag, $src ] );
}
}
};
my $analy_css = sub {
my ( $content, $base ) = @_;
my %tmp = ();
my @files = grep( ( $_ and !$tmp{$_}++ ), $content =~ m/$IMPORT_RE/ig );
while ( my $file = shift @files ) {
$store_filedata->( 'link', undef => URI->new_abs( $file, $base ) );
}
@files = grep( ( $_ and !$tmp{$_}++ ), $content =~ m/$URL_RE/ig );
while ( my $file = shift @files ) {
$store_filedata->( 'img', undef => URI->new_abs( $file, $base ) );
}
};
my $attach = sub {
my ( $url, $res ) = @_;
my $part = $mhtml->attach(
Type => $res->header('Content-Type') || 'AUTO',
Data => $res->content(),
Filename => basename($url),
Disposition => 'inline',
);
$part->attr( 'content-type' => $res->header('Content-Type') );
$part->attr( 'Content-Location' => $url );
};
my $request = sub {
my ( $tag, $url ) = @_;
exists $url_flag{$url} and return;
$url_flag{$url} = undef;
my $req = HTTP::Request->new( GET => $url );
$req->referer($base_url);
my $res = $ua->request($req);
if ( $tag eq 'link' and $res->header('Content-Type') =~ m|text/css| ) {
$analy_css->( $res->content(), $url );
}
if ( $res->is_success ) {
$attach->( $url, $res );
}
else {
cluck $res->status_line;
}
};
my $part = $mhtml->attach(
Type => 'text/html',
Data => $res->content(),
Filename => basename($base_url),
Disposition => 'inline',
);
$part->attr( 'content-type' => $res->header('Content-Type') );
$part->attr( 'Content-Location' => $base_url );
my $content = $res->content();
my @style_tags = $content =~ m|(.+?)|igs;
while ( my $style_tag = shift @style_tags ) {
$analy_css->( $style_tag, $base_url );
}
my $parser = HTML::LinkExtor->new( $store_filedata, $base_url );
$parser->parse($content);
while ( my $file = shift @files ) {
my ( $tag, $url ) = @$file;
$request->( $tag, $url );
}
return $mhtml;
}
1;