All following posts were imported from use.perl.org

2010-09-04
⇦
⇧
⇨

All posts following this post were imported from the defunct use.perl.org. The import was done on 27th June 2011, but I put this notice in here as a clean separator. I also imported the comments various people posted to various blog (or journal...) entries.

I reused the harvester presented here as-is, but adapted the extractor a bit (to also fetch comments, and to write in a format suitable for importing it here). Thanks to Yanick Champoux for the code, and here's mine:

#!/usr/bin/perl
use 5.10.0;
use strict;
use warnings;
use utf8;
use DateTime;
use pQuery;

# Maps a three-letter English month abbreviation ("Jan" .. "Dec") to its
# month number (1 .. 12). Built from the ordered list so the numbering
# follows directly from position.
my %months = do {
    my $number = 0;
    map { $_ => ++$number } qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
};
# Every output id handed out so far; used to keep two posts with the same
# title from writing to the same file.
my %seen;

# Convert each saved journal page named on the command line into
# out/<id>.txt: a short header (title, date, converter), the post body,
# a link to the original post and, if present, the comments.
foreach my $file (@ARGV) {
    # Only existing files whose name is purely numeric are processed.
    next unless -e $file;
    next unless $file =~ /^\d+$/;
    say $file;

    open(my $fh, '<', $file) || die "Cannot read $file: $!";
    my $p = pQuery(join('', <$fh>));
    close($fh);

    # get(1) selects the second '.title h3' match.
    # NOTE(review): presumably the first match is a different heading on
    # the page -- confirm against a sample file.
    my $title = $p->find('.title h3')->get(1)->innerHTML;
    my $id    = _unique_id(_slug_for($title), \%seen);

    # Date looks like "<Monthname> <day>, <year>" at the end of the string;
    # only the first three letters of the month are kept for the lookup.
    my ($month, $day, $year)
        = $p->find('.journaldate')->html() =~ /(\w{3})\w* 0?(\d+), (\d{4})$/;
    die "$file: cannot parse journal date\n"
        unless defined $year && exists $months{$month};

    # Time looks like "hh:mm AM" / "hh:mm PM".
    my ($hour, $min, $apm)
        = $p->find('.details')->html() =~ /(\d\d):(\d\d) (\w\w)/;
    die "$file: cannot parse posting time\n" unless defined $apm;

    my $date = DateTime->new(
        year   => $year,
        month  => $months{$month},
        day    => $day,
        hour   => _hour_24($hour, $apm),
        minute => $min,
    );

    # Apply the same rule as for comment links: only prepend the scheme
    # when the href does not already carry one.
    my $source = $p->find('.h-inline a')->get(0)->getAttribute('href');
    $source = 'http:' . $source unless $source =~ /^http/;

    my $content = $p->find('.intro')->get(0)->innerHTML;

    my @comments;
    $p->find('.comment')->each(sub {
        my $comment = pQuery($_);    # wrap the element once, query it thrice

        my $link  = $comment->find('h4 a')->get(0);
        my $c_url = $link->getAttribute('href');
        $c_url = 'http:' . $c_url unless $c_url =~ /^http/;
        my $c_subject = $link->innerHTML;

        my $c_author = $comment->find('.details a')->get(0)->innerHTML;
        $c_author =~ s/ \(\d+\)//;    # drop the trailing " (<number>)"

        my $c_body = $comment->find('.commentBody div')->get(0)->innerHTML;

        # NOTE(review): $c_url is computed but does not appear in the line
        # below; presumably "(orignal post)" was meant to link to it --
        # confirm before changing the emitted text.
        # A former "prefix the body with a newline" step was guarded by
        # `$c_body !~ /^\s?/`, which can never be true (the pattern matches
        # every string), so it is omitted; the body is appended unchanged.
        push(@comments, qq{$c_author: $c_subject (orignal post)$c_body});
    });

    open(my $out, ">:encoding(UTF-8)", "out/$id.txt")
        || die "Cannot write out/$id.txt: $!";
    say $out "title: $title";
    say $out "date: " . $date->iso8601;
    say $out "converter: html";
    print $out "\n";
    say $out "" . $content;
    say $out "\nOriginal: $source";
    if (@comments) {
        say $out "Legacy comments";
        say $out join("\n", @comments);
    }
    # Buffered write errors only surface at close, so check it.
    close($out) || die "Cannot close out/$id.txt: $!";
}

# Turn a title into a lowercase id: every non-word character becomes '_',
# runs of '_' are collapsed, and a leading/trailing '_' is removed.
sub _slug_for {
    my ($title) = @_;
    my $slug = lc($title);
    $slug =~ s/\W/_/g;
    $slug =~ s/_+/_/g;
    $slug =~ s/\A_//;
    $slug =~ s/_\z//;
    return $slug;
}

# Return an id not yet recorded in %$seen (and record it). On a clash the
# numeric "_<n>" suffix is bumped, or "_1" is appended when there is none.
sub _unique_id {
    my ($id, $seen) = @_;
    while ($seen->{$id}++) {
        say "dup id $id";
        # Bump the suffix arithmetically: `$id++` is not usable here, since
        # a string containing '_' is not eligible for Perl's magic string
        # increment and would be numified to 0 instead.
        if ($id =~ /_(\d+)\z/) {
            my $next = $1 + 1;
            $id =~ s/_\d+\z/_$next/;
        }
        else {
            $id .= '_1';
        }
        say "check $id";
    }
    return $id;
}

# Convert a 12-hour clock hour plus "AM"/"PM" marker to 0..23.
# 12 AM is 0 and 12 PM is 12; any other marker leaves the hour untouched.
sub _hour_24 {
    my ($hour, $apm) = @_;
    return $hour % 12 + 12 if $apm eq 'PM';
    return $hour % 12      if $apm eq 'AM';
    return $hour;
}