All following posts were imported from use.perl.org

All posts following this post were imported from the defunct use.perl.org. The import was done on 27th June 2011, but I put this notice in here as a clean separator. I also imported the comments various people posted to various blog (or journal...) entries.

I reused the harvester presented here as-is, but adapted the extractor a bit (to also fetch comments, and to write its output in a format suitable for importing here). Thanks to Yanick Champoux for the code, and here's mine:

#!/usr/bin/perl
use 5.10.0;
use strict;
use warnings;
use utf8;
use DateTime;
use pQuery;

# Converts harvested journal pages into importable text files.
#
# Every command-line argument that names an existing file and consists only
# of digits is parsed with pQuery. The title, date, source link, intro
# content and comments are extracted and written to out/<id>.txt, where <id>
# is derived from the title and made unique within this run.

# Abbreviated English month name => month number for DateTime->new.
my %months = (
    Jan => 1,
    Feb => 2,
    Mar => 3,
    Apr => 4,
    May => 5,
    Jun => 6,
    Jul => 7,
    Aug => 8,
    Sep => 9,
    Oct => 10,
    Nov => 11,
    Dec => 12,
);

# Ids already handed out in this run, so two posts never share an output file.
my %seen;

# Turn a title into a lowercase id in which every non-word character is
# replaced by '_', runs of '_' are collapsed, and leading/trailing '_' dropped.
sub _slug_for {
    my ($title) = @_;

    my $slug = lc($title);
    $slug =~ s/[^\w\d\_]/_/g;
    $slug =~ s/_+/_/g;
    $slug =~ s/_$//g;
    $slug =~ s/^_//g;

    return $slug;
}

# Return $id, or the first free "<id>_<n>" variant, and record it in %$seen.
sub _claim_unique_id {
    my ( $id, $seen ) = @_;

    while ( $seen->{$id}++ ) {
        say "dup id $id";

        # A plain $id++ does not work here: the '_' in the id rules out
        # Perl's magic string increment, so "foo_1" would turn into the
        # number 1. Bump the numeric suffix explicitly instead.
        unless ( $id =~ s/_(\d+)$/'_' . ( $1 + 1 )/e ) {
            $id .= '_1';
        }
        say "check $id";
    }

    return $id;
}

# Convert a 12-hour clock reading plus 'AM'/'PM' marker into an hour 0..23.
# 12 AM is midnight (0) and 12 PM is noon (12). Any other marker leaves the
# hour untouched.
sub _hour_24 {
    my ( $hour, $apm ) = @_;

    if ( $apm eq 'AM' ) {
        $hour = 0 if $hour == 12;
    }
    elsif ( $apm eq 'PM' ) {
        $hour += 12 if $hour < 12;
    }

    return $hour;
}

foreach my $file (@ARGV) {
    next unless -e $file;
    next unless $file =~ /^\d+$/;
    say $file;

    open( my $fh, '<', $file ) or die "open $file: $!";
    my $html = do { local $/; <$fh> };
    close $fh;
    my $p = pQuery($html);

    # get(1) picks the element at index 1, i.e. the second '.title h3' match.
    my $title = $p->find('.title h3')->get(1)->innerHTML;
    my $id    = _claim_unique_id( _slug_for($title), \%seen );

    my ( $month, $day, $year ) =
        $p->find('.journaldate')->html() =~ /(\w{3})\w* 0?(\d+), (\d{4})$/
        or die "$file: cannot parse the journal date\n";
    my ( $hour, $min, $apm ) =
        $p->find('.details')->html() =~ /(\d\d):(\d\d) (\w\w)/
        or die "$file: cannot parse the posting time\n";

    my $date = DateTime->new(
        year   => $year,
        month  => $months{$month},
        day    => $day,
        hour   => _hour_24( $hour, $apm ),
        minute => $min,
    );

    my $source  = $p->find('.h-inline a')->get(0)->getAttribute('href');
    my $content = $p->find('.intro')->get(0)->innerHTML;

    my @comments;
    $p->find('.comment')->each(sub {
        my $c    = pQuery($_);
        my $link = $c->find('h4 a')->get(0);

        # NOTE(review): $c_url is computed but never written out below.
        # Presumably the "(orignal post)" text once wrapped it in a link
        # whose markup was lost when this listing was published -- confirm.
        my $c_url = $link->getAttribute('href');
        $c_url = 'http:' . $c_url unless $c_url =~ /^http/;

        my $c_subject = $link->innerHTML;
        my $c_author  = $c->find('.details a')->get(0)->innerHTML;
        my $c_body    = $c->find('.commentBody div')->get(0)->innerHTML;

        # NOTE(review): the blank-line padding here and in the output below
        # may also be the residue of stripped markup; it is reproduced
        # exactly as found.
        if ( $c_body && $c_body !~ /^\s?\n\n/ ) {
            $c_body = "\n\n$c_body\n\n";
        }

        # Drop the first " (digits)" group from the author name.
        $c_author =~ s/ \(\d+\)//;

        push( @comments, qq{\n$c_author: $c_subject (orignal post)\n$c_body} );
    });

    my $out_file = "out/$id.txt";
    open( my $out, ">:encoding(UTF-8)", $out_file )
        or die "open $out_file: $!";

    # Header lines, then an empty line, then the body.
    say $out "title: $title";
    say $out "date: " . $date->iso8601;
    say $out "converter: html";
    print $out "\n";

    say $out "\n\n" . $content;
    say $out "\n\nOriginal: http:$source\n\n";

    if (@comments) {
        say $out "\n\nLegacy comments\n\n";
        say $out join( "\n", @comments );
    }

    # Buffered write errors only surface at close time.
    close $out or die "close $out_file: $!";
}