#!/usr/bin/perl -W use strict; #1) remove xml/html tags. All lowercase. Remove stopwords. Remove #non-alphabetic characters (including numbers). Remove words that have #just one character. use File::Find; # jvd: read zips use Archive::Zip qw( :ERROR_CODES :CONSTANTS ); use IO::Zlib; # jvd: write zip stream # jvd: Quick Reference: http://www.xmltwig.com/xmltwig/quick_ref.html # sudo aptitude install libxml-twig-perl use XML::Twig; use Data::Dumper # jvd: convert HTML to plain text require HTML::TreeBuilder; require HTML::FormatText; #use Data::Dumper; # jvd: DEBUG: # jvd: when experimenting with XML::LibXML, had problems setting up SAX, # FIX: # cd /usr/local/share/perl/5.8.7/XML/; sudo mv SAX.pm SAX.pm.old # sudo aptitude install libxml-perl libxml-libxml-perl # --- User Parameters --------------------------------------------------------- # jvd: just hard-coding for now if( $#ARGV!=3 ) { print STDERR "incorrect number of arguments\n"; exit(1); } #my $search_path = 'corpus/news/'; my $search_path = $ARGV[0]; my $stop_file = $ARGV[1]; my $wrd_file = $ARGV[2]; my $ind_file = $ARGV[3]; # --- Main -------------------------------------------------------------------- # jvd: load stop words into hash my %stop_words; if( $stop_file ne '' ) { #open(STOP,'$stop_fn') or die "couldn't open $stop_fn: $!"; tie(*STOP, 'IO::Zlib', $stop_file, "rb") or die "couldn't open $stop_file: $!"; while() { chomp; $stop_words{$_} = 1; # jvd: just defining it as something } } # jvd: instantiate an HTML formatter my $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 79); # jvd: map words to integers starting with $cnt+1 my $cnt = 0; # jvd: the first key used will be ++$cnt my %key_by_wrd; # jvd: open compressed output streams #open(FILE,"| gzip >$filename") or die "couldn't open $filename: $!"; tie(*WRD, 'IO::Zlib', $wrd_file, "wb") or die "couldn't open $wrd_file: $!"; tie(*IND, 'IO::Zlib', $ind_file, "wb") or die "couldn't open $ind_file: $!"; # jvd: this kicks off all subsequent control flow find( {wanted => \&find_action, no_chdir => 1}, $search_path); close(WRD); close(IND); # --- Utility Subroutines ----------------------------------------------------- my ($text_twig_cnt,$topic_twig_cnt); my @topic; my @doc; sub find_action { return unless( /\d+\.zip$/ ); print "processing XML in $_\n"; my $zip = new Archive::Zip($_); my @xml_members; eval{ @xml_members = $zip->membersMatching( '.*\.xml' ) }; if( $@ ) { warn "while reading $_: $@"; next; } foreach my $member ( @xml_members ) { #my $slurp = $zip->contents( $file ); my ( $slurp, $status ) = $member->contents(); unless($status==AZ_OK) { warn "error reading " . $member->fileName() . " of $_\n"; next; } #print Dumper($file); my $t = XML::Twig->new ( # jvd: the twig will include just the root and selected titles, ie # title => sub { $_[1]->print;} twig_roots => { 'newsitem/text' => \&process_text, 'newsitem/metadata/codes' => \&process_codes }, output_filter => 'latin1' ); $topic_twig_cnt = 0; $text_twig_cnt = 0; undef @topic; undef @doc; eval{ $t->parse( $slurp ) }; warn $member->fileName() . " of $_: $@" if $@; if( $topic_twig_cnt==1 && $text_twig_cnt==1 ) { print IND join("\t",@topic) . "\n"; print WRD join(" ",@doc) . "\n"; } else { warn "ERROR: ". $member->fileName() ." of $_: topic_twigs=$topic_twig_cnt, text_twig_cnt=$text_twig_cnt"; #exit; } } #exit; # jvd: DEBUG: } sub process_codes { my( $t, $elt)= @_; return unless(lc $elt->att('class') eq 'bip:topics:1.0'); #print IND $elt->parent->att('itemid') . "\t"; #print IND join("\t",map {$_->att('code')} $elt->descendants('code'))."\n"; @topic = ($elt->parent->att('itemid'),map {$_->att('code')} $elt->descendants('code')); $t->purge; # frees the memory $topic_twig_cnt+=1; return 1; } sub process_text { my( $t, $elt)= @_; my $html = $elt->sprint; my $html_tree = HTML::TreeBuilder->new->parse($html); my $text = lc $formatter->format($html_tree); #my $text = $elt->text; # store the text (including sub-element texts) while( $text=~/(([a-z]('[a-z])?)+)/g ) { my $word = $1; # jvd: NB: the list used has only members of /(([a-z]('[a-z])?)+)/ # hence our regex is ok, and we ignore matched stops words next if( $stop_file ne '' && exists $stop_words{$word} ); # jvd: transliterate non-alphabetics to nothing $word=~tr/a-zA-Z//cd; # jvd: always ignore single letter or NULL words next if( ($word=~tr///c)<=1 ); #print WRD "$word "; push(@doc,$word); } $t->purge; # frees the memory $text_twig_cnt+=1; return 1; }