All posts following this post were imported from the defunct use.perl.org. The import was done on 27th June 2011, but I put this notice in here as a clean separator. I also imported the comments various people posted to various blog (or journal...) entries.
I reused the harvester presented here as-is, but adapted the extractor a bit (to also fetch comments, and to write in a format suitable for importing here). Thanks to Yanick Champoux for the code, and here's mine:
#!/usr/bin/perl
use 5.10.0;
use strict;
use warnings;
use utf8;
use DateTime;
use pQuery;
# Three-letter English month abbreviation => month number (1..12).
# Filled through a hash slice so the names and numbers line up by position.
my %months;
@months{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 1 .. 12;
# Ids already handed out during this run; used to keep the output file
# names unique when several posts share a title.
my %seen;

# Convert every saved journal page named on the command line into
# out/<id>.txt: a short header (title, date, converter), the post body,
# a link to the original page and, if present, the comments on the page.
foreach my $file (@ARGV) {
    # Only existing files whose name consists solely of digits are handled.
    next unless -e $file;
    next unless $file =~ /^\d+$/;
    say $file;

    # NOTE(review): the page is read without an encoding layer while the
    # output is written as UTF-8 -- confirm the encoding of the input files.
    open(my $fh, '<', $file) or die "Cannot read $file: $!";
    my $p = pQuery(join('', <$fh>));
    close $fh;

    # The element at index 1 of the '.title h3' matches carries the title.
    my $title = $p->find('.title h3')->get(1)->innerHTML;
    my $id    = _unique_id($title, \%seen);

    # Date, e.g. "<Monthname> 07, 2011" at the end of the .journaldate html.
    my ($month, $day, $year) =
        $p->find('.journaldate')->html() =~ /(\w{3})\w* 0?(\d+), (\d{4})$/;
    defined $year or die "No journal date found in $file";

    # Time of day, "hh:mm" followed by a two-letter AM/PM marker.
    my ($hour, $min, $apm) =
        $p->find('.details')->html() =~ /(\d\d):(\d\d) (\w\w)/;
    defined $apm or die "No post time found in $file";

    my $date = DateTime->new(
        year   => $year,
        month  => $months{$month},
        day    => $day,
        hour   => _hour_24($hour, $apm),
        minute => $min,
    );

    # Prefix the scheme only when the link does not already carry one
    # (same rule as for the comment links below).
    my $source = $p->find('.h-inline a')->get(0)->getAttribute('href');
    $source = 'http:' . $source unless $source =~ /^http/;

    my $content = $p->find('.intro')->get(0)->innerHTML;

    my @comments;
    $p->find('.comment')->each(sub {
        my $c = $_;

        # NOTE(review): $c_url is computed but never written out; the
        # "(orignal post)" text below presumably was a link to it -- confirm.
        my $c_url = pQuery($c)->find('h4 a')->get(0)->getAttribute('href');
        $c_url = 'http:' . $c_url unless $c_url =~ /^http/;

        my $c_subject = pQuery($c)->find('h4 a')->get(0)->innerHTML;
        my $c_author  = pQuery($c)->find('.details a')->get(0)->innerHTML;
        my $c_body    = pQuery($c)->find('.commentBody div')->get(0)->innerHTML;

        # NOTE(review): /^\s?/ matches every string, so this branch never
        # runs; the pattern and the wrapper text look truncated -- confirm
        # the intended check before relying on it.
        if ($c_body && $c_body !~ /^\s?/) {
            $c_body = "\n$c_body\n";
        }

        # Drop the numeric " (1234)" suffix after the author name.
        $c_author =~ s/ \(\d+\)//;

        push(@comments, qq{$c_author: $c_subject (orignal post)\n$c_body});
    });

    open(my $out, '>:encoding(UTF-8)', "out/$id.txt")
        or die "Cannot write out/$id.txt: $!";
    say $out "title: $title";
    say $out "date: " . $date->iso8601;
    say $out "converter: html";
    print $out "\n";
    say $out "" . $content;
    say $out "\nOriginal: $source\n";
    if (@comments) {
        say $out "Legacy comments\n";
        say $out join("\n", @comments);
    }
    # Buffered write errors only show up on close, so check it.
    close($out) or die "Cannot finish out/$id.txt: $!";
}

# Turn a post title into a lower-case id made of word characters and single
# underscores, and make it unique with respect to %$seen by appending or
# bumping a numeric "_N" suffix. Every id tried is recorded in %$seen.
# Returns the id.
sub _unique_id {
    my ($title, $seen) = @_;

    my $id = lc($title);
    $id =~ s/\W/_/g;     # anything that is not a word character
    $id =~ s/_+/_/g;     # collapse runs of underscores
    $id =~ s/_$//g;
    $id =~ s/^_//g;

    while ($seen->{$id}++) {
        say "dup id $id";
        # Bump an existing numeric suffix arithmetically. A plain $id++
        # would numify a string like "foo_1" and yield just "1".
        unless ($id =~ s/_(\d+)$/'_' . ($1 + 1)/e) {
            $id .= '_1';
        }
        say "check $id";
    }

    return $id;
}

# Convert a 12-hour clock hour plus its 'AM'/'PM' marker into a 0..23 hour:
# 12 AM becomes 0, 12 PM stays 12, other PM hours get 12 added. Any other
# marker leaves the hour untouched.
sub _hour_24 {
    my ($hour, $apm) = @_;

    if ($apm eq 'AM') {
        $hour = 0 if $hour == 12;
    }
    elsif ($apm eq 'PM') {
        $hour += 12 if $hour < 12;
    }

    return $hour;
}