# Script for converting from ugly html files to Movable Type's import/export format.
# It will need extensive modification to accommodate your individual entry template.
# Written by Yami McMoots - http://greengabbro.net/
# ... and released to the public domain. I hope someone else finds it useful.
#
# How it works: every file in $html_dir is read line by line and driven
# through two small state machines (one for the entry, one for the current
# comment).  Recognised pieces are appended to $testoutput as TITLE / STATUS /
# AUTHOR / DATE / CATEGORY / BODY / COMMENT sections, and each input file is
# terminated with an eight-dash entry separator.

use strict;
use warnings;

# Entry/comment cycle:
use constant {
    IDLE      => 0,    # Nothing
    IN_BODY   => 1,    # Just found the beginning of an entry/comment body.
    WANT_META => 2,    # Just finished reading the body. The next line is metadata.
    READY     => 3,    # Just finished writing metadata. Time to write the body.
};

# Path to test file (opened in append mode, so existing content is kept):
my $testoutput = '/Users/maria/Sites/gabbro/googlecached/2006.txt';

# Directory containing all the saved html files, and nothing else
my $html_dir = '/Users/maria/Sites/gabbro/googlecached/2006';

# ---------------------------------------------------------------------------
# Template patterns.  Adjust these to your own entry template.
# ---------------------------------------------------------------------------
my $TITLE_END_RE   = qr/<\/title>/;    # line holding the entry title
my $ENTRY_START_RE = qr/entrytext/;    # entry body starts after this line
my $ENTRY_END_RE   = qr/<\/div>/;      # entry body ends at this line

# NOTE(review): the two comment delimiters below are deliberately left unset.
# In the copy of this script that was reviewed, the markup inside these two
# patterns had been stripped, leaving an empty pattern (which Perl treats as
# "reuse the last successful pattern") and a pattern containing only a line
# break.  The intended markup cannot be recovered from the script itself.
# Set both to match your template, for example:
#     $COMMENT_START_RE = qr/<div class="comment">/;
#     $COMMENT_END_RE   = qr/<br \/>/;
# Until both are set, comments are skipped (a warning is printed at startup).
my $COMMENT_START_RE;
my $COMMENT_END_RE;

# Month name (abbreviated or full) => two-digit month number.
my %MONTH_NUMBER = (
    Jan => '01', January   => '01',
    Feb => '02', February  => '02',
    Mar => '03', March     => '03',
    Apr => '04', April     => '04',
    May => '05',
    Jun => '06', June      => '06',
    Jul => '07', July      => '07',
    Aug => '08', August    => '08',
    Sep => '09', September => '09',
    Oct => '10', October   => '10',
    Nov => '11', November  => '11',
    Dec => '12', December  => '12',
);

warn "Comment start/end patterns are not configured; comments will be skipped.\n"
    unless defined $COMMENT_START_RE && defined $COMMENT_END_RE;

open my $out, '>>', $testoutput or die "Error opening output: $!";

# readdir returns bare names (including "." and ".."), so keep plain files
# only and always open them relative to $html_dir, not the current directory.
opendir my $dir, $html_dir or die "Error reading directory $!";
my @files = grep { -f "$html_dir/$_" } readdir $dir;
closedir $dir;
@files = sort @files;    # deterministic processing order

# In-memory scratch buffers.  They are shared across input files and are only
# emptied at the flush points in the loop below.
my $entry_body   = '';    # entry body lines collected so far
my $comment_body = '';    # body lines of the comment currently being read
my $comments     = '';    # finished COMMENT sections awaiting output

for my $file (@files) {
    my $path = "$html_dir/$file";
    open my $in, '<', $path or die "Error opening $path: $!";

    my $entry   = IDLE;
    my $comment = IDLE;

    # Are there comments on this entry?
    my $has_comments = 0;

    while ( my $raw = <$in> ) {    # read each line, one at a time, in turn.
        my $line = $raw;           # working copy; $raw is kept for diagnostics

        # Entry title
        if ( $line =~ $TITLE_END_RE ) {
            $line =~ s/\t//g;              # Strip stupid tabs
            $line =~ s/$TITLE_END_RE//;    # Get rid of the end title tag
            print {$out} "TITLE: ", $line;
        }

        # Entry categories.  Seeing this line is also the signal that the
        # entry's metadata is complete and its body can be written out.
        if ( $line =~ m/filed under (.*?)<\/a>/ ) {

            # NOTE(review): the "<a ...>" prefix is reconstructed; the
            # surviving pattern was just "(.*?)</a>", which would have put
            # the anchor markup itself into CATEGORY.  Confirm against the
            # saved HTML.
            while ( $line =~ m/<a\b[^>]*>(.*?)<\/a>/g ) {
                print {$out} "CATEGORY: $1\n";
            }
            $entry   = READY;
            $comment = IDLE;
        }

        # Just finished an entry or comment body, time for metadata.
        # No "use utf8" here on purpose: the middle dot in these patterns is
        # matched as raw bytes against raw (undecoded) input lines.
        if ( $entry == WANT_META || $comment == WANT_META ) {
            my %meta;
            my $kind = '';    # 'entry' or 'comment' once a pattern matched

            if ( $entry == WANT_META
                && $line =~ /(\w+?) · (\d\d?):(\d\d) · (\d*) (\S{3}) (\d{4})/ )
            {
                @meta{qw(author hours mins day month year)}
                    = ( $1, $2, $3, $4, $5, $6 );
                $kind  = 'entry';
                $entry = IDLE;
            }

            # Comment has an author URL.
            # NOTE(review): the '<a href="(.*?)">' prefix is reconstructed.
            # Seven captures are consumed below but only six groups survived
            # in the reviewed copy, and the first one is used as the URL.
            elsif ( $comment == WANT_META
                && $line =~ m/<a href="(.*?)">(.*?)<\/a> · (\w*?) (\d{1,2})\w\w, (\d{4}) · (\d\d):(\d\d)/ )
            {
                @meta{qw(url author month day year hours mins)}
                    = ( $1, $2, $3, $4, $5, $6, $7 );
                $kind = 'comment';
            }

            # Comment with no author URL
            elsif ( $comment == WANT_META
                && $line =~ /(.*?) · (.*?) (\d{1,2}).., (\d{4}) · (\d\d):(\d\d)/ )
            {
                @meta{qw(author month day year hours mins)}
                    = ( $1, $2, $3, $4, $5, $6 );
                $meta{url} = '';
                $kind = 'comment';
            }
            elsif ( $comment == WANT_META ) {
                print "Metadata regex failure on line $raw";
            }

            if ($kind) {

                # Month lookup; an unknown name is reported and passed through.
                if ( exists $MONTH_NUMBER{ $meta{month} } ) {
                    $meta{month} = $MONTH_NUMBER{ $meta{month} };
                }
                else {
                    print "Month regex failure: $meta{month}\n";
                }

                # Zero-pad via sprintf so "5" becomes "05" and an already
                # padded "05" is not turned into "005".
                $meta{day}   = sprintf '%02d', $meta{day}   || 0;
                $meta{hours} = sprintf '%02d', $meta{hours} || 0;

                my $date = "DATE: $meta{month}/$meta{day}/$meta{year} "
                         . "$meta{hours}:$meta{mins}:00\n";

                if ( $kind eq 'entry' ) {
                    print {$out} "AUTHOR: $meta{author}\n";
                    print {$out} $date;
                }
                else {
                    $comments .= "COMMENT:\n";
                    $comments .= "AUTHOR: $meta{author}\n";
                    $comments .= $date;
                    $comments .= "URL: $meta{url}\n" if $meta{url};
                }
            }

            # A comment moves on to "write the body" even when its metadata
            # line could not be parsed; an entry keeps waiting for a match.
            $comment = READY if $comment == WANT_META;
        }

        # Print the entry body, followed by any comments collected so far
        if ( $entry == READY ) {
            print {$out} "-----\n", "BODY:\n", $entry_body, "-----\n";
            $entry_body = '';
            $entry      = IDLE;

            if ($has_comments) {
                print {$out} $comments;
                $comments = '';
            }
        }

        # Comment metadata parsed: move the comment body into the comment queue
        elsif ( $comment == READY ) {
            $comments .= $comment_body . "-----\n";
            $comment_body = '';
            $comment      = IDLE;
        }

        # Body end markers are tested before the append below, so the closing
        # line itself is never copied into a body.
        if ( $entry == IN_BODY && $line =~ $ENTRY_END_RE ) {
            $entry = WANT_META;
        }
        if (   $comment == IN_BODY
            && defined $COMMENT_END_RE
            && $line =~ $COMMENT_END_RE )
        {
            $comment = WANT_META;
        }

        # If we're in the entry body, keep the (non-blank) line:
        if ( $entry == IN_BODY && $line =~ /\S/ ) {
            $line =~ s/\t//g;
            $entry_body .= $line;
        }

        # If we're in a comment body, keep the (non-blank) line:
        if ( $comment == IN_BODY && $line =~ /\S/ ) {
            $line =~ s/<\/?p>//g;
            $line =~ s/\t//g;
            $comment_body .= $line;
        }

        # Start markers are tested after the append above, so the opening
        # line itself is never copied into a body.
        if ( $line =~ $ENTRY_START_RE ) {    # Entry body start
            print {$out} "STATUS: Publish\n";
            $entry = IN_BODY;
        }
        if ( defined $COMMENT_START_RE && $line =~ $COMMENT_START_RE ) {
            $comment      = IN_BODY;         # Comment start
            $has_comments = 1;
        }
    }

    print {$out} "--------\n";    # Movable Type entry separator
    close $in;
}

# Buffered write errors only surface at close, so check it.
close $out or die "Error closing output: $!";