#!/usr/bin/perl -W use strict; # 2) build a vocabulary according to stemmed training data. Remove from # vocabulary terms that appear only once. use IO::Zlib; use porter; # jvd: stem if( $#ARGV!=4 ) { warn "incorrect number of arguments"; exit(1); } my $trn_file = $ARGV[0]; my $ind_file = $ARGV[1]; my $wrd_file = $ARGV[2]; # jvd: will contain a hash of root words with value==freq my $voc_file = $ARGV[3]; my $freq_file = $ARGV[4]; # jvd: open either compressed output streams or regular txt files #open(FILE,"| gzip >$filename") or die "couldn't open $filename: $!"; local ( *WRD, *IND, *VOC, *FREQ ); if( $wrd_file=~/\.gz$/ ) { tie(*WRD, 'IO::Zlib', $wrd_file, "rb") or die "couldn't open $wrd_file: $!"; } else { open(WRD, "<$wrd_file") or die "couldn't open $wrd_file: $!"; } if( $ind_file=~/\.gz$/ ) { tie(*IND, 'IO::Zlib', $ind_file, "rb") or die "couldn't open $ind_file: $!"; } else { open(IND, "<$ind_file") or die "couldn't open $ind_file: $!"; } if( $voc_file=~/\.gz$/ ) { tie(*VOC, 'IO::Zlib', $voc_file, "wb") or die "couldn't open $voc_file: $!"; } else { open(VOC, ">$voc_file") or die "couldn't open $voc_file: $!"; } if( $freq_file=~/\.gz$/ ) { tie(*FREQ, 'IO::Zlib', $freq_file, "wb") or die "couldn't open $freq_file: $!"; } else { open(FREQ, ">$freq_file") or die "couldn't open $freq_file: $!"; } my %istrain_by_id; if( $trn_file ne '""' ) { #tie(*TRN, 'IO::Zlib', $trn_file, "rb") or die "couldn't open $trn_file: $!"; print "reading $trn_file... "; while( ) { next unless($_=~/(\d+)/); my $itemid = $1; $istrain_by_id{$itemid} = $itemid; } close(TRN); print "finished!\n"; } my %voc_by_stm; my $number = 0; # => first word begins with 1 # jvd: assuming lines(IND)==lines(WRD) print "constructing vocabulary... "; while( ) { my $itemid = ''; if(/^(.+?)\t/) { $itemid = $1; } else { next; } print "$itemid\n"; my $text = ; # jvd: TODO: this needs to be reactivated after debugging next if( %istrain_by_id && !exists($istrain_by_id{$itemid}) ); while( $text=~/(\w+)/g ) { my $word = $1; # jvd: always ignore single letter or NULL words # jvd: TODO: this needs to be reactivated after debugging next if( ($word=~tr///c)<=1 ); my $stem = porter($word); # jvd: is this a new stem? unless( exists $voc_by_stm{$stem} ) { # jvd: yes, so set count to 1 and loop $voc_by_stm{$stem} = { $word=>1 }; next; } # jvd: not a new stem, so see if we've seen this root word unless( exists ${$voc_by_stm{$stem}}{$word} ) { ${$voc_by_stm{$stem}}{$word}=0; } # jvd: root word \in HASH => increment ++${$voc_by_stm{$stem}}{$word}; } } close(WRD); close(IND); print "finished!\n"; print "saving vocabulary... "; foreach (sort keys %voc_by_stm) { my $stem = $_; my %hash = %{$voc_by_stm{$_}}; my @words = sort keys %hash; my $count = 0; $count+=$_ foreach( values %hash ); # jvd: TODO: this needs to be reactivated after debugging next if( $count<=1 ); #print $stem . "\t" . join("\t",@{$voc_by_stm{$stem}}) . "\n"; #print VOC $stem . "\t" . join("\t",@{$voc_by_stm{$stem}}) . "\n"; print VOC "$stem\n"; print FREQ "$stem\t$count\t". join("\t",map {$_."(".$hash{$_}.")"} @words)."\n"; } close(VOC); close(FREQ); print "finished!\n";