#!/usr/bin/perl -W
use strict;
use warnings;

#3) convert train and test files to integer sequences based on the word
#indices in the vocabulary.
#
# Arguments (exactly four, in this order):
#   1. output directory - one file per item is written here, named by item id
#   2. index file       - each usable line starts with "<itemid><TAB>"
#   3. word file        - one line of text per item
#   4. vocabulary file  - one entry per line; an entry's index is its
#                         1-based line number
# Any input file whose name ends in ".gz" is read through IO::Zlib.

use IO::Zlib;
use porter;

if ( @ARGV != 4 ) {
    warn "incorrect number of arguments";
    exit(1);
}

my ( $output_path, $ind_file, $wrd_file, $voc_file ) = @ARGV;

# jvd: open either compressed input streams or regular txt files
#
# Returns a read handle for $path and dies if it cannot be opened.
# All three inputs are only ever read, so the compressed case always uses
# "rb" (opening the vocabulary with "wb" would truncate it).
sub open_for_reading {
    my ($path) = @_;

    my $fh;
    if ( $path =~ /\.gz$/ ) {
        $fh = IO::Zlib->new( $path, "rb" )
            or die "couldn't open $path: $!";
    }
    else {
        # 3-arg open: the file name can never be interpreted as a mode.
        open( $fh, '<', $path )
            or die "couldn't open $path: $!";
    }
    return $fh;
}

# Open every input up front so a bad path fails before any output is written.
my $wrd_fh = open_for_reading($wrd_file);
my $ind_fh = open_for_reading($ind_file);
my $voc_fh = open_for_reading($voc_file);

# jvd: read vocabulary file
# Maps each vocabulary entry to its 1-based line number.
my %int_by_stm;
print "reading $voc_file... ";
my $line_no = 1;
while ( my $entry = <$voc_fh> ) {
    chomp $entry;
    $int_by_stm{$entry} = $line_no;
    ++$line_no;
}
close($voc_fh);
print "finished!\n";

# jvd: assuming lines(IND)==lines(WRD)
# NOTE(review): an index line without a tab is skipped WITHOUT consuming a
# line from the word file, exactly as before - confirm this is intended if
# the index file can contain such lines.
while ( my $ind_line = <$ind_fh> ) {
    next unless $ind_line =~ /^(.+?)\t/;
    my $itemid = $1;

    # NOTE(review): the item id is used verbatim as the output file name.
    my $out_file = "$output_path/$itemid";
    open( my $out_fh, '>', $out_file )
        or die "couldn't open $out_file: $!";

    # A word file shorter than the index file yields an empty document
    # rather than a match against undef.
    my $text = <$wrd_fh>;
    $text = '' unless defined $text;

    while ( $text =~ /(\w+)/g ) {
        # Copy $1 before the call: porter() may run its own regexes.
        my $word = $1;

        # porter() comes from the project's porter module (presumably a
        # Porter stemmer - confirm against that module).
        my $stem = porter($word);

        # Words whose stem is not in the vocabulary are silently dropped.
        print {$out_fh} $int_by_stm{$stem} . " "
            if exists $int_by_stm{$stem};
    }
    print {$out_fh} "\n";

    # Buffered write errors only surface at close, so check it.
    close($out_fh)
        or die "couldn't close $out_file: $!";
}

close($ind_fh);
close($wrd_fh);
print "finished!\n";