package MHTML; use base Exporter; our @EXPORT = qw(get_mht); use strict; use Carp qw(croak cluck); use URI; use File::Basename; use HTML::LinkExtor; use LWP::UserAgent; use MIME::Lite; my $QUOTED_STR = qr/ \" ([^\"]*) \" | \' ([^\']*) \' /x; my $URL_RE = qr/ url \s* \( \s* (?: $QUOTED_STR | (.*?) ) \s* \) /ix; my $IMPORT_RE = qr/ \@import (?: \s+ $URL_RE | \s* $QUOTED_STR | \s+ (\S+) ) /xi; sub new { my ( undef, $url ) = @_; return get_mht($url); } sub get_mht { my $target = shift or croak 'A target is not provided.'; my ( $base_url, $ua, $res, @files ); if ( ref $target->[0] eq 'LWP::UserAgent' and ref $target->[1] eq 'HTTP::Response' ) { $ua = $target->[0]; $res = $target->[1]; $base_url = $res->base(); } else { $ua = new LWP::UserAgent; $base_url = $target; my $req = HTTP::Request->new( GET => $base_url ); $req->referer($base_url); $res = $ua->request($req); $res->is_success or croak $res->status_line; } my %url_flag = ( $base_url => undef ); my $mhtml = MIME::Lite->new( Subject => $res->title(), Type => 'multipart/related', ); $mhtml->delete('X-Mailer'); $mhtml->delete('Date'); $mhtml->attr( "Content-Location" => $base_url ); my $store_filedata = sub { my ( $tag, %attr ) = @_; if ( $tag eq 'img' or $tag eq 'link' or $tag eq 'script' ) { foreach my $src ( values %attr ) { push( @files, [ $tag, $src ] ); } } }; my $analy_css = sub { my ( $content, $base ) = @_; my %tmp = (); my @files = grep( ( $_ and !$tmp{$_}++ ), $content =~ m/$IMPORT_RE/ig ); while ( my $file = shift @files ) { $store_filedata->( 'link', undef => URI->new_abs( $file, $base ) ); } @files = grep( ( $_ and !$tmp{$_}++ ), $content =~ m/$URL_RE/ig ); while ( my $file = shift @files ) { $store_filedata->( 'img', undef => URI->new_abs( $file, $base ) ); } }; my $attach = sub { my ( $url, $res ) = @_; my $part = $mhtml->attach( Type => $res->header('Content-Type') || 'AUTO', Data => $res->content(), Filename => basename($url), Disposition => 'inline', ); $part->attr( 'content-type' => $res->header('Content-Type') ); $part->attr( 'Content-Location' => $url ); }; my $request = sub { my ( $tag, $url ) = @_; exists $url_flag{$url} and return; $url_flag{$url} = undef; my $req = HTTP::Request->new( GET => $url ); $req->referer($base_url); my $res = $ua->request($req); if ( $tag eq 'link' and $res->header('Content-Type') =~ m|text/css| ) { $analy_css->( $res->content(), $url ); } if ( $res->is_success ) { $attach->( $url, $res ); } else { cluck $res->status_line; } }; my $part = $mhtml->attach( Type => 'text/html', Data => $res->content(), Filename => basename($base_url), Disposition => 'inline', ); $part->attr( 'content-type' => $res->header('Content-Type') ); $part->attr( 'Content-Location' => $base_url ); my $content = $res->content(); my @style_tags = $content =~ m|(.+?)|igs; while ( my $style_tag = shift @style_tags ) { $analy_css->( $style_tag, $base_url ); } my $parser = HTML::LinkExtor->new( $store_filedata, $base_url ); $parser->parse($content); while ( my $file = shift @files ) { my ( $tag, $url ) = @$file; $request->( $tag, $url ); } return $mhtml; } 1;