All following posts were imported from use.perl.org
All posts following this post were imported from the defunct use.perl.org. The import was done on 27th June 2011, but I put this notice in here as a clean separator. I also imported the comments various people posted to various blog (or journal...) entries.
I reused the harvester presented here as-is, but adapted the extractor a bit (to also fetch comments, and to write in a format suitable for importing here). Thanks to Yanick Champoux for the code, and here's mine:
#!/usr/bin/perl
# Convert harvested journal pages into plain-text files for re-import.
#
# Usage: run with the harvested page files as arguments. Only arguments
# that exist and whose names consist solely of digits are processed. For
# each page the script writes out/<id>.txt, where <id> is derived from
# the entry title, containing a small header (title, date, converter),
# the entry body, a link to the original, and any comments found.
# The out/ directory must already exist.
use 5.10.0;
use strict;
use warnings;
use utf8;

use DateTime;
use pQuery;

# Month abbreviation (first three letters of the month name) => number.
my %months = (
    Jan => 1,
    Feb => 2,
    Mar => 3,
    Apr => 4,
    May => 5,
    Jun => 6,
    Jul => 7,
    Aug => 8,
    Sep => 9,
    Oct => 10,
    Nov => 11,
    Dec => 12,
);

# Ids handed out so far; used by unique_id() to avoid overwriting output.
my %seen;

foreach my $file (@ARGV) {
    next unless -e $file;
    next unless $file =~ /^\d+$/;
    say $file;

    open(my $fh, '<', $file) or die "Cannot read $file: $!";
    my $p = pQuery(join('', <$fh>));
    close $fh;

    my $title = $p->find('.title h3')->get(1)->innerHTML;
    my $id    = unique_id($title, $file);

    # Entry date, e.g. "... June 27, 2011": month name, day, 4-digit year.
    my $date_html = $p->find('.journaldate')->html() // '';
    my ($month, $day, $year) =
        $date_html =~ /(\w{3})\w* 0?(\d+), (\d{4})$/;
    die "$file: cannot parse entry date from '$date_html'\n"
        unless defined $year && exists $months{$month};

    # Entry time in 12-hour notation, e.g. "09:15 PM".
    my $details_html = $p->find('.details')->html() // '';
    my ($hour, $min, $apm) = $details_html =~ /(\d\d):(\d\d) (\w\w)/;
    die "$file: cannot parse entry time\n" unless defined $apm;

    my $date = DateTime->new(
        year   => $year,
        month  => $months{$month},
        day    => $day,
        hour   => to_24_hour($hour, $apm),
        minute => $min,
    );

    my $source  = $p->find('.h-inline a')->get(0)->getAttribute('href');
    my $content = $p->find('.intro')->get(0)->innerHTML;

    my @comments;
    $p->find('.comment')->each(sub {
        my $c = $_;

        # NOTE(review): $c_url is normalised here but never written to the
        # output below; presumably the link markup around "orignal post"
        # was lost when this script was published -- confirm.
        my $c_url = pQuery($c)->find('h4 a')->get(0)->getAttribute('href');
        $c_url = 'http:' . $c_url unless $c_url =~ /^http/;

        my $c_subject = pQuery($c)->find('h4 a')->get(0)->innerHTML;
        my $c_author  = pQuery($c)->find('.details a')->get(0)->innerHTML;
        my $c_body    = pQuery($c)->find('.commentBody div')->get(0)->innerHTML;

        # NOTE(review): /^\s?/ matches every string, so this branch can
        # never run as written. The pattern and the wrapper string look
        # like they lost HTML markup in publication -- confirm against the
        # original script before changing it.
        if ($c_body && $c_body !~ /^\s?/) {
            $c_body = "\n$c_body\n";
        }

        # Drop the parenthesised number that follows the author name.
        $c_author =~ s/ \(\d+\)//;

        push(@comments, "$c_author: $c_subject (orignal post)\n$c_body");
    });

    my $out_path = "out/$id.txt";
    open(my $out, '>:encoding(UTF-8)', $out_path)
        or die "Cannot write $out_path: $!";

    say {$out} "title: $title";
    say {$out} "date: " . $date->iso8601;
    say {$out} "converter: html";
    print {$out} "\n";
    say {$out} "" . $content;
    say {$out} "\nOriginal: http:$source\n";

    if (@comments) {
        say {$out} "Legacy comments\n";
        say {$out} join("\n", @comments);
    }

    # Buffered write errors only surface at close, so check it.
    close $out or die "Cannot close $out_path: $!";
}

# Build a file-name-safe id from $title that has not been used before.
# Non-word characters become underscores, runs of underscores collapse,
# and leading/trailing underscores are removed. If nothing is left,
# $fallback is used instead. On a collision "_1", "_2", ... is appended
# to the base id until a free one is found; each probe is reported on
# STDOUT. Records the returned id in %seen.
sub unique_id {
    my ($title, $fallback) = @_;

    my $base = lc $title;
    $base =~ s/\W/_/g;
    $base =~ s/_+/_/g;
    $base =~ s/_$//;
    $base =~ s/^_//;
    $base = $fallback if $base eq '';

    # A plain "$id++" cannot be used here: Perl's magic string increment
    # does not apply to strings containing "_", so "foo_1" would turn
    # into the number 1. Count the suffix explicitly instead.
    my $id     = $base;
    my $suffix = 0;
    while ($seen{$id}++) {
        say "dup id $id";
        $id = $base . '_' . ++$suffix;
        say "check $id";
    }

    return $id;
}

# Convert a 12-hour clock hour plus "AM"/"PM" marker to 0..23.
# 12 AM maps to 0 and 12 PM stays 12; with any other marker the hour is
# returned unchanged (as a number).
sub to_24_hour {
    my ($hour, $meridiem) = @_;

    $hour = int $hour;
    $meridiem = uc $meridiem;

    if ($meridiem eq 'PM') {
        $hour += 12 if $hour < 12;
    }
    elsif ($meridiem eq 'AM') {
        $hour = 0 if $hour == 12;
    }

    return $hour;
}