#!/usr/bin/perl -w
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# extract a journalspace blog entry/entries into an rss file
# that can be imported into something like wordpress
#
# the html parsing is accurate as of the 11th of August 2007,
# but may have changed since then, but the basic ideas should
# be the same, so feel free to modify.
#
# couple of caveats - you have to be logged into journalspace,
# with the "keep me logged in" selection (so LWP can use the
# cookie), and if an entry id doesn't exist we will croak and
# die, it can be coded around, but for a one-off it's really
# not worth the effort.
#
# also some dates can't be picked up, again due to some
# strangeness from journalspace, so they will appear as having
# been posted at epoch time, let's call it another tradeoff.
#
# fintanr 11th August 2007
#

use strict;
use warnings;

use LWP;
use HTTP::Cookies;
use HTML::TreeBuilder;
use Getopt::Std;
use File::Basename;
use XML::RSS;
use Date::Manip;

# Command line options, filled in by getopts():
#   -u  base url of the journalspace blog
#   -c  Mozilla/Netscape format cookie file
#   -p  a single entry id to export
#   -s  first entry id of a range to export (used with -e)
#   -e  last entry id of a range to export (used with -s)
#   -o  output file (defaults to a file under /tmp)
#   -v  verbose, print each url as it is processed
#   -h  print usage and exit
our ( $opt_u, $opt_c, $opt_p, $opt_s, $opt_e, $opt_o, $opt_v, $opt_h );

# getopts() returns false when it meets an option it does not know
if ( !getopts("u:c:p:s:e:o:vh") ) {
    usage();
    exit(1);
}

if ( defined($opt_h) || !defined($opt_u) || !defined($opt_c) ) {
    usage();
    exit(1);
}

# we need either a single entry (-p) or a complete range (-s and -e)
if ( !defined($opt_p) && ( !defined($opt_s) || !defined($opt_e) ) ) {
    usage();
    exit(1);
}

# you might want to edit this
#
my $blogtitle = "0 + 2 = 1";

my $jsBasePage = sprintf("%s/?entryid=", $opt_u);
my $jsEditPage = sprintf("%s/blog/new_entry_original.php?edit=", $opt_u);
my $cookieFile = $opt_c;

# -s and -e take precedence over -p when both are supplied
my $start = defined($opt_s) ? $opt_s : $opt_p;
my $end   = defined($opt_e) ? $opt_e : $opt_p;

# entry ids are counted through numerically below, so insist on integers
foreach my $id ( $start, $end ) {
    if ( $id !~ /\A\d+\z/ ) {
        usage();
        exit(1);
    }
}

my $outFile = "/tmp/$$.rss.xml";
if ( defined($opt_o) ) {
    $outFile = $opt_o;
}
else {
    printf("Outputting to %s\n", $outFile);
}

# check the cookie file ourselves so that a bad path is reported here
# rather than surfacing later as a page we cannot parse
if ( !-r $cookieFile ) {
    die("failed to read $cookieFile");
}

my $cookies = HTTP::Cookies::Netscape->new( file => $cookieFile );
my $ua      = LWP::UserAgent->new();
$ua->cookie_jar($cookies);

my $time = localtime();
my $now  = UnixDate(ParseDate($time), "%g");

my $rss = XML::RSS->new( version => '2.0' );
$rss->channel(
    title         => $blogtitle,
    language      => 'en',
    description   => $blogtitle,
    link          => $opt_u,
    generator     => "perl with special guest 'escape from journalspace'",
    lastBuildDate => $now
);

for my $i ( $start .. $end ) {

    # the edit entry page doesn't give us any details on the
    # original date that we posted, so we need to pull that out
    # of the regular pages...
    #
    my $tmpurl = sprintf("%s%s", $jsBasePage, $i);
    printf("Processing %s\n", $tmpurl) if $opt_v;

    my ( $title, $category, $entry ) = extractEntry($jsEditPage, $i);
    my ($postdate) = extractDate($tmpurl);

    # move the date into a format we can use
    # we don't have a proper timestamp, so we will just set everything
    # to midnight
    my $date = UnixDate(ParseDate($postdate), "%g");

    # okay, we have enough data to build up an item and add it into our
    # RSS feed
    $rss->add_item(
        title       => $title,
        description => $entry,
        pubDate     => $date,
        category    => $category,
        guid        => $tmpurl
    );
}

# workaround add_item doing the right thing to the entry contents
# and put back in <, > and ". We lose links and formatting all over
# the place otherwise.
#
# both the named and the numeric spellings of each entity are accepted
# NOTE(review): which spelling XML::RSS emits depends on the installed
# version - confirm against the one in use.
my $outputRSS = $rss->as_string;
$outputRSS =~ s/&(?:lt|#x3[Cc]|#60);/</g;
$outputRSS =~ s/&(?:gt|#x3[Ee]|#62);/>/g;
$outputRSS =~ s/&(?:quot|#x22|#34);/"/g;

open(my $out, '>', $outFile) or die("Can't open $outFile: $!");
print {$out} $outputRSS;

# buffered write errors only show up at close time
close($out) or die("Can't write $outFile: $!");

#
# extractEntry(url, entryNo)
#
# fetch the edit page for an entry (url with entryNo appended) and
# return the list ( title, category, entry text ).
#
# dies if the page does not contain the form fields we expect, which
# is what happens when the entry id does not exist.
#
sub extractEntry {
    my ( $url, $entryNo ) = @_;

    my $tempurl  = sprintf("%s%s", $url, $entryNo);
    my $response = $ua->get($tempurl);
    my $content  = $response->content();

    # okay, lets run our content into a HTML::TreeBuilder object, and extract
    # out the parts we are interested in
    #
    # we have the luxury of a couple of assumptions here
    #
    # 1. The title is always going to be of input type=text with a name subj
    # 2. text is going to be in a textarea with a name matching msgtext
    # 3. category - checkbox that is always going to have CATSET, and is checked
    #
    # NOTE(review): this comment used to give the textarea name as
    # "msgtxt" while the code matches "msgtext"; the code has been left
    # as it was - confirm against a live page.
    my $tree = HTML::TreeBuilder->new_from_content($content);

    # look_down in scalar context hands back the first match, or undef
    my $titleField = $tree->look_down('name' => qr/subj/);
    my $entryField = $tree->look_down('name' => qr/msgtext/);

    if ( !defined($titleField) || !defined($entryField) ) {
        $tree->delete();
        die(sprintf("Can't find the entry form fields in %s (%s)\n",
            $tempurl, $response->status_line));
    }

    my $title = $titleField->attr('value');
    my $entry = $entryField->as_text;

    # we will lose some category details for posts that were multi category, but
    # its a fair trade off. The last checked category on the page wins.
    my $category   = "Uncategorized";
    my @categories = $tree->look_down('value' => qr/CATSET/);
    foreach my $cat (@categories) {
        if ( $cat->attr('checked') ) {
            $category = $cat->attr('name');
            $category =~ s/cat-//;
        }
    }

    $tree->delete();

    return ( $title, $category, $entry );
}

#
# extractDate(url)
#
# fetch the public page for an entry and return the text of its first
# bold tag with the leading "posted " removed.
#
# dies if the page has no bold tag at all.
#
sub extractDate {
    my ($url) = @_;

    my $response = $ua->get($url);
    my $content  = $response->content();
    my $tree     = HTML::TreeBuilder->new_from_content($content);

    # dates all seem to occur on the first bold tag, so we just
    # look there.
    my $bold = $tree->look_down('_tag' => 'b');

    if ( !defined($bold) ) {
        $tree->delete();
        die(sprintf("Can't find a posting date in %s (%s)\n",
            $url, $response->status_line));
    }

    my $date = $bold->as_text;
    $tree->delete();

    $date =~ s/posted //;

    return ($date);
}

#
# usage()
#
# print a short summary of the command line options to stdout.
#
sub usage {
    my $self = basename(__FILE__);

    printf("\nUsage : %s -u <url> -c <cookiefile> -p <entryid> ", $self);
    printf("|-s <startid> -e <endid> [-o outfile][-v][-h]\n\n");
    printf("Cookie file is your Mozilla Cookie file\n\n");
}