#!/usr/bin/perl
# parseKos.perl - Utility for parsing DailyKos archive pages
# Copyright (C) 2006, 2007, 2008 Jon F. Garfunkel
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# http://www.gnu.org/copyleft/gpl.html

###########################################
# Topic - for indexing topic information
#
# A Topic is a blessed hash holding a human-readable description plus a
# set of per-topic counters, all starting at zero.  The main script
# updates the counters directly through hash access.
#
# The bare block keeps "use strict" / "use warnings" confined to this
# package: both pragmas are lexically scoped, and the main script below
# relies on undeclared package globals.
{
    package Topic;

    use strict;
    use warnings;

    # new($class, $desc)
    #   $class - package name to bless the new object into
    #   $desc  - human-readable description of the topic
    # Returns a new Topic object with every counter initialised to 0.
    sub new {
        my ($class, $desc) = @_;

        my %self = (
            "description"      => $desc,
            "KosPosts2002"     => 0,
            "KosPosts2003"     => 0,
            "OtherPosts"       => 0,
            "TotalPosts"       => 0,
            "Comments2002"     => 0,
            "Comments2003"     => 0,
            "Comments"         => 0,
            "WordCount2002"    => 0,
            "WordCountKos2003" => 0,
            "WordCount2003"    => 0,
            "ZeroLinked"       => 0,
            # The per-post tallying code increments 'OneLinked' and
            # 'ManyLinked' (capital L), so these are the spellings that
            # must start at zero.
            "OneLinked"        => 0,
            "ManyLinked"       => 0,
            # Original (misspelled) keys, kept in case anything still
            # reads them; nothing increments these.
            "Onelinked"        => 0,
            "Manylinked"       => 0,
            "NumLinks"         => 0,
        );

        return bless \%self, $class;
    }
}

#############################################
package main;
use CSV;
use Switch;

##############################################
# Basic stuff

# Who's running the program?
$MyName = "Jon Garfunkel";
# Who's our daddy?
$urlbase = "http://www.dailykos.net/archives";
# Where's our stash?
$sourcepath = "pages";

###############################################
# These are the 16 topics we'll be looking for...
@topicnames = ("iraq","bushies","primary","congress","states","calif","dems","economy","domestic", "gwot","world","media","misc","democracy","site","open"); %topics = (); $topics{"bushies"} = Topic->new("Bush administration"); $topics{"iraq"} = Topic->new("Iraq war"); $topics{"gwot"} = Topic->new("Afghanistan, 9/11, war on terror"); $topics{"world"} = Topic->new("World affairs (exluding wars)"); $topics{"domestic"} = Topic->new("Domestic Affairs"); $topics{"economy"} = Topic->new("Economy"); $topics{"dems"} = Topic->new("Democratic Party"); $topics{"democracy"} = Topic->new("Democracy: Voting, Free Speech, etc."); $topics{"congress"} = Topic->new("Congressional politics"); $topics{"states"} = Topic->new("State/local politics"); $topics{"calif"} = Topic->new("California politics"); $topics{"primary"} = Topic->new("2004 Democratic Presidential Primary"); $topics{"media"} = Topic->new("Media"); $topics{"misc"} = Topic->new("Baseball, etc."); $topics{"site"} = Topic->new("Site issues and announcements"); $topics{"open"} = Topic->new("Open Thread"); ############################################### # Initialize some files # Let's write out an Atom/RSS feed $rdf = "DailyKos.atom"; open (RDF,"> $rdf"); print RDF "\n"; print RDF "DailyKos archive May 26, 2002 - October 14, 2003\n"; # Also, a CSV is nice to take home $csvfile = "DailyKos.csv"; open (CSV,"> $csvfile"); CSVinit(); $csvline = CSVjoin("id", "Author", "Date", "Category", "Title", "Wordcount", "Comments"); print CSV "$csvline\n";; # And an index ByMonth $monthfile = "ByMonth.html"; open (BYMONTH, "> $monthfile"); select(BYMONTH); print_header("By Month"); print "Posts in each topic"; print "posts with # hyperlinks"; print "Month"; foreach (@topicnames) { $t = $topics{$_}; $desc = $t->{'description'}; print "$_"; } print "comments"; print "zero"; print "one"; print "many"; print "total posts"; print "\n"; ####### Also, one page for each topic foreach (@topicnames) { $f = "kos_" . $_ . 
".html"; open (TOPIC, ">$f"); select TOPIC; print_header($topics{$_}->{'description'}); print "DateTitleAuthorwordcount"; print "commentsLinksIn. Links"; close TOPIC; } #################################### # Initialize some vars for monthly summary $lastMonth = ""; $lastMonthLabel = ""; $MonthRowCount = 0; $lastDay = ""; %TopicsMonth = (); $MonthComments = 0; %TopicsCount = (); %Authors = (); $NumLinks = 0; $ZeroLinked = 0; $OneLinked = 0; $ManyLinked = 0; open (MONTH, ">/dev/null"); ###################################### # # Let's loop on through # # $start = 1; $end = 4571; print STDERR "parsing 4571 articles...\n"; for ($i = $start; $i< $end; $i += 1) { print STDERR "." if ($i % 100 == 0); $id = sprintf("%06d",$i); $file = sprintf("%06d.html", $i); $url = "$urlbase/$file"; # open file $file = "$sourcepath/$file"; # print STDERR "opening $file"; next if (!-e $file); open (FILE, "< $file"); # initialize variables $numposts = 0; $date = ""; $author = "kos"; $subject = ""; $comments = 0; $wordcount = 0; @urls = (); @links = (); $internalLinks = 0; $flag = ""; $line=0; # the very first post is about Kos. The rest we have to figure out $subject = "site" if $i == 1; $state = 0; while () { switch ($state) { ##################### case 0 { # first we find a title. We can use but let's take the one from the body if (m|<span class="title">(.+)$|) { $title = $1; # clean it up $cutoff = rindex($title, "<"); $title = substr($title, 0, $cutoff); # We'll guess the subject from the title $subject = guess_subject($title); $state = 1; $line = 0; } # break; } #######################3 case 1 { # We're now in the body of the post # There must be an author here somewhere. Bueller? Anybody? 
$a = guess_author($_); $author = $a if ($a ne ""); # Let's look at the paragraphs and the occasional italicized line if ((m/^<p>/i) || (m/^<i>/)) { if ($subject eq "") { # We're going to keep quessing $subject = guess_subject($_); # and as we do, we can give a hint to the output as to where we found the keyword $line++; $flag = "*" . $line if $subject ne ""; } } # Take another guess at the date. Some of the content includes rdf:dc # Most don't. $d = guess_date($_); if ($d ne "") { $date = $d; $date =~ s/\s+$//; # and then the comments are here if (m|Comments</a> \((\d+)\)|) { $comments = $1; # $internalLinks--; # because this is a link $state = 3; next; } # otherwise, maybe comments are in the next line $state = 2; } # find some hyperlinks if (m|href="([^"]+)">([^<]+)</a>|) { push (@links, "<a class='extlink' href='$1'>$2</a>"); } @newlinks = m|href="http://([^/"]+)[/"]|gi; foreach $nl (@newlinks) { push(@urls,$nl); $internalLinks++ if ($nl =~ /dailykos/); $urlcount[$nl]++; } # take a rough count of the words @words = split(/\s/, $_); $count = @words; $wordcount += $count if ($date eq ""); } ########################### case 2 { if (m|Comments</a> \((\d+)\)|) { $comments = $1; $state = 3; } } # default: break; } } # we done with this file close FILE; $wordcount -= 2 if ($wordcount > 1); next if $title eq "Test"; ################################# # Now interpret the data # Find a date, loser ($Month, $Day, $Year, $Time, $M) = split(/ /,$date); chop($Day); $YM = $Year . "_" . $Month; $MonthFile = "kos_$YM.html"; $Mon = substr($Month, 0, 3); # One last adjustment to the subject if ($subject eq "election") { $subject = ($Year eq "2003") ? 
"primary" : "congress"; } $title = "untitled" if $title eq ""; $title =~ s/"/'/g; if (($Year == 2002) && ($title =~ /Gephardt/)) { $subject = "iraq"; # When Gephardt sold the Dems out } $subject = "misc" if $subject eq ""; ########################## # Adjustments # make some adjustments to the author $author = "kos" if $i==4101; # correcting for mis-identification $author = "Steve Gilliard" if $i==2240; $author = "Steve Soto" if $i==1021; $author = "Steve Soto" if $i==1078; $author = "Billmon" if $i==801; $author = "Billmon" if $i==952; $author = "Billmon" if $i==1054; $author = "Billmon" if $i==1808; $author = "Billmon" if $i==1845; $author = "kos" if $i==1915; $author = "RonK" if $i==3922; $author = "RonK" if $i==4106; $author = "RonK" if $i==4135; $author = "RonK" if $i==4145; # several of the "Open Thread" posts were substantive; most site issues $subject = "congress" if $i==655; $subject = "iraq" if $i==1460; $subject = "iraq" if $i==2240; $subject = "iraq" if $i==4219; $subject = "site" if $i==481; # reveals name to be Markos $subject = "site" if $i==596; $subject = "site" if $i==1168; $subject = "site" if $i==2022; $subject = "site" if $i==2308; $subject = "site" if $i==2330; $subject = "site" if $i==3148; $subject = "site" if $i==3331; $subject = "site" if $i==3407; $subject = "site" if $i==3677; $subject = "site" if $i==3799; $subject = "site" if $i==3910; $subject = "misc" if $i==4232; $subject = "misc" if $i==4331; $subject = "bushies" if $i==25; $subject = "media" if $i==634; # "polls" are often wrong... 
$subject = "primary" if $i==2231; $subject = "primary" if $i==3822; $subject = "primary" if $i==4153; $subject = "states" if $i==4334; $subject = "congress" if $i==172; $subject = "congress" if $i==343; $subject = "states" if $i==452; $subject = "states" if $i==456; $subject = "states" if $i==484; ################################### # Increment our counters $TopicsCount{$subject}++; $TopicsMonth{$subject}++; $Authors{$author}++; $MonthComments += $comments; $PostsThisMonth++; $t = $topics{$subject}; if ($Year eq "2002") { $t->{'KosPosts2002'}++; $t->{'Comments2002'} += $comments; $t->{'WordCount2002'} += $wordcount; } else { if ($author eq "kos") { $t->{'KosPosts2003'}++; $t->{'WordCountKos2003'} += $wordcount; } else { $t->{'OtherPosts'}++; $t->{'WordCount2003'} += $wordcount; } $t->{'Comments2003'} += $comments; } $numlinks = @links; $t->{'NumLinks'} += $numlinks; if ($numlinks == 0) { $t->{'ZeroLinked'}++; $ZeroLinked++; } elsif ($numlinks == 1) { $t->{'OneLinked'}++; $OneLinked++; } else { $t->{'ManyLinked'}++; $ManyLinked++; } # for debugging - $flag indicates with a # if the keyword was found in the paragraphs # print STDERR "$id $date s:$subject$flag $title -by $author w:$wordcount\tc:$comments\n"; # write out CSV and RDF out... $csvline = CSVjoin($id, $author, $date, $subject, $title, $wordcount, $comments); print CSV "$csvline\n";; print RDF "<entry>\n\t<link rel='alternate' type='text/html' href='$url'/>\n"; print RDF "\t<title>$title\n\t$author"; print RDF "\n\t$subject\n\t$date\n"; print RDF "\n"; # now write some neat HTML files. Ideally, we ought to process the RDF instead. # We'll do one file per month, for starters. # As well as one per topic $title = substr($title,0,84); open (TOPIC, ">> kos_$subject.html"); $TopicRowClass = ($TopicsCount{$subject} % 2 == 1) ? 
"oddrow" : ""; $allLinks = @urls; select(TOPIC); print ""; print "$Year $Mon $Day $Time $M "; print "$title\n"; print " $author$wordcount"; print "$comments"; print "$allLinks$internalLinks"; print "\n"; close TOPIC; # we could print out the links, too. Another time. # print TOPIC "