#!/bin/sh
#
# Driver for the corpus preprocessing pipeline: runs the Perl steps
# 1-clean.pl, 2-build.pl and 3-convert.pl in order and stops at the first
# step that fails, exiting with that step's status.
#
# The Perl scripts are referenced as ./N-name.pl, so run this script from the
# directory that contains them.
#
# Typical invocation:
#   nice ./preprocess.sh 1>status 2>err &

# NOTE: BASE_PATH and OUTPUT_PATH deliberately keep their trailing slash so
# the strings handed to the Perl scripts stay exactly as they have always been.
BASE_PATH=/noback/lowbow/reuters/
SEARCH_PATH=$BASE_PATH/corpus/news
WORD_FILE=$BASE_PATH/wrd.txt.gz
INDEX_FILE=$BASE_PATH/ind.txt.gz
VOCAB_FILE=$BASE_PATH/voc.txt.gz
FREQ_FILE=$BASE_PATH/freq.txt.gz
OUTPUT_PATH=$BASE_PATH/numericFiles/
# The leading ~ must stay unquoted here so the shell expands it to the
# home directory at assignment time.
STOP_WORDS=~/lowbow/reuters/corpus/english.stop.gz
TRAINID_FILE=~/lowbow/reuters/corpus/training-itemids.txt.gz #topic_codes.txt.gz

# 1) remove xml/html tags. All lowercase. Remove stopwords. Remove
# non-alphabetic characters (including numbers). Remove words that have just
# one character.
perl ./1-clean.pl "$SEARCH_PATH" "$STOP_WORDS" "$WORD_FILE" "$INDEX_FILE" || exit

# 2) build a vocabulary according to stemmed training data. Remove from
# vocabulary terms that appear only once.
perl ./2-build.pl "$TRAINID_FILE" "$INDEX_FILE" "$WORD_FILE" "$VOCAB_FILE" \
  "$FREQ_FILE" || exit

# 3) convert train and test files to integer sequences based on the word
# indices in the vocabulary.
#
# Start from an empty output directory. rm -rf is a no-op when the path does
# not exist, so no existence test is needed (and none that would require
# bash's [[ ]] under /bin/sh). ${OUTPUT_PATH:?} aborts the script instead of
# running rm -rf with an empty argument should the variable ever be blanked.
rm -rf -- "${OUTPUT_PATH:?}" || exit
mkdir -- "$OUTPUT_PATH" || exit
perl ./3-convert.pl "$OUTPUT_PATH" "$INDEX_FILE" "$WORD_FILE" "$VOCAB_FILE" || exit

# 4) save to /noback/lowbow/numericFiles the files containing the integer
# sequence. In addition, save the vocabulary as a vocabulary file. Save also
# which words were converted to which stems. Save file containing list of
# files of each class.
# perl ./4-matlab.pl ...