% Cross-validated SVM experiment comparing a standard RBF kernel against an
% RBF kernel computed on "translated" term-frequency vectors.
%
% Stage 1 builds one binary CV pool file per entry of C1/C2 in exp_savedir.
% Stage 2 iterates over every '*vs*.pool.mat' file found in exp_savedir and
%   (a) sweeps rbfgamma for the standard RBF kernel, then
%   (b) sweeps (cc, tt, trbfgamma) for the translated RBF kernel, passing a
%       kernel matrix to the SVM through the temporary file K_file.
% Results are checkpointed to '<c1labels>vsAll<cut/1000>k.svmrbf.mat'.
%
% Relies on project helpers under ./helper/ (formBinaryPool,
% sampleFromBinaryPool, svmWrapper, transD2tf, normalizeRows, jvdmatrixWrite,
% mktemp); their exact contracts are not visible from this script.

% ensure everything ends in slash!
addpath('./helper/');
workdir      = './matfiles/';
heat_savedir = [workdir 'heat_kernels_q/'];
exp_savedir  = [workdir 'experiments/svm/allnew/'];

C1 = {{'M11'}};   % no regex here
C2 = {{'.*'}};
%C1 = {{'C11'} {'E11'} {'G154'} {'M11'}};
%C2 = {{'.*'} {'.*'} {'.*'} {'.*'}};
%C2 = {{'C.*'} {'E.*'} {'G.*'} {'M.*'}};

% --- Build CV Pool -----------------------------------------------------------
load([workdir 'topics.mat']);
load([workdir 'member.7k.mat']);

maxNumPerClass   = 2500;
numFolds         = 40;
numTrainPerClass = 150;
numTestPerClass  = 300;

if ~exist(exp_savedir,'dir'), mkdir(exp_savedir); end

for ii=1:length(C1)
    [ids1 ids2 topids1 topids2] = formBinaryPool( ...
        C1{ii}, C2{ii}, topics, member, maxNumPerClass);

    % Join the class-1 topic labels with '_' to form the pool file prefix
    % (interleave labels with separators, then flatten to a single string).
    c1labels = topics(topids1);
    c1labels = [c1labels;repmat({'_'},1,length(c1labels)-1) {''}];
    c1labels = cell2mat(reshape(c1labels,1,numel(c1labels)));

    save([exp_savedir c1labels 'vsAll.pool.mat'], ...
        'ids1', 'ids2', 'topids1', 'topids2',...
        'maxNumPerClass','numFolds','numTrainPerClass','numTestPerClass');
end

% --- Perform experiments -----------------------------------------------------
load([workdir 'tf.7k.mat']);
load([workdir 'doclen.7k.mat']);

cc         = [0.50 0.25];
tt         = [1.28 1.44 1.60 1.76];
trbfgamma  = [1 2 3 4];
rbfgamma   = [.5 1 2 3 4];
cut        = 2000;

K_file = mktemp(['/noback/lowbow/translation/tmp/K.XXXXXX.mtx']);

% Flags for the translated-kernel runs: kernel type 4 reading the kernel
% matrix from K_file. Kept in its own variable so that the standard RBF sweep
% below (which builds its own flags) cannot overwrite it.
kernelfile_svmflags = [' -t 4 -u "' K_file '" '];

% Suffix of the results file, e.g. cut=2000 -> '2k.svmrbf.mat'.
% (num2str must be applied AFTER the division; dividing the string itself
% produces non-printable characters in the filename.)
result_suffix = [num2str(cut/1000) 'k.svmrbf.mat'];

% --- Loop through parameters to test translation kernel ----------------------
filelist = dir(exp_savedir);
ptrn = '(.*)vs(.*)\.pool\.mat';

fprintf('CV RBF Kernel under translation experiment.\n');
for fln = {filelist.name}
    file  = char(fln);
    match = regexp(file,ptrn,'tokens');

    % jvd: by re-checking file existence, we can control execution by removing,
    % but not adding kernels
    if isempty(match) || ~exist([exp_savedir file],'file')
        continue;
    end

    c1labels = char(match{1}(1));
    c2labels = char(match{1}(2));

    % Loads ids1/ids2/topids1/topids2 and the pool's own sampling settings
    % (maxNumPerClass, numFolds, numTrainPerClass, numTestPerClass).
    load([exp_savedir file]);
    ids = [ids1 ids2];

    result_file = [exp_savedir c1labels 'vsAll' result_suffix];

    fprintf('%s vs. %s\n',c1labels,c2labels);

    % Reset per-file summaries so checkpoints never carry values from a
    % previously processed pool; NaN marks "not computed yet".
    avgacc  = nan(1,length(rbfgamma));
    stdacc  = nan(1,length(rbfgamma));
    avgtacc = nan(length(cc),length(tt),length(trbfgamma));
    stdtacc = nan(length(cc),length(tt),length(trbfgamma));

    % --- Standard RBF kernel -------------------------------------------------
    fprintf('\tCV standard RBF kernel (with cut=%d):\n',cut);
    acc = zeros(length(rbfgamma),numFolds);
    for ii=1:length(rbfgamma)
        fprintf('\t\trbfgamma = %f\n',rbfgamma(ii));
        rbf_svmflags = [' -t 2 -g ' num2str(rbfgamma(ii)) ' '];

        % --- Cross-Validation ------------------------------------------------
        for ff=1:numFolds
            [trn tst] = sampleFromBinaryPool(maxNumPerClass,numTrainPerClass,numTestPerClass);
            trn_vects = tf(:,ids(trn.samps));
            tst_vects = tf(:,ids(tst.samps));
            acc(ii,ff) = svmWrapper( trn_vects, trn.labels, trn.ids,...
                tst_vects, tst.labels, tst.ids, rbf_svmflags, 0 );
        end
        % Normalise by the test-set size (2 classes x numTestPerClass);
        % presumably svmWrapper returns a count of correct predictions.
        avgacc(ii) = mean(acc(ii,:))/(2*numTestPerClass);
        stdacc(ii) = std(acc(ii,:))/(2*numTestPerClass);
        fprintf('\t\t\tmean: %f stddev: %f\n', avgacc(ii), stdacc(ii));
    end

    % --- Translated RBF kernel -----------------------------------------------
    fprintf('\tCV translated RBF kernel (with cut=%d):\n',cut);

    % prepare for feature selection
    if cut>0
        % Sort terms by document frequency (ascending) and pick the last
        % `cut` of them.
        [df dfOrder] = sort(full(sum(spones(tf),2)));
        Jsub = dfOrder((end-cut+1):end);
        % jvd: i.e., the words with highest doc freq get removed
        % Linear indices of the diagonal entries (j,j) for the selected terms.
        Jind = sub2ind([size(tf,1) size(tf,1)],Jsub,Jsub);
    end

    tacc = zeros(length(cc),length(tt),length(trbfgamma),numFolds);
    for ii=1:length(cc)
        c = cc(ii);
        for jj=1:length(tt)
            t = tt(jj);
            H_file = [heat_savedir 'Hc' num2str(100*c) 't' num2str(100*t) '.7k.mat'];
            fprintf('\t\t%s\n',H_file);
            load(H_file);   % expected to provide H

            % --- Optional Feature selection (on T) ---------------------------
            if cut>0
                % Zero the selected rows/columns and put 1 on their diagonal,
                % i.e. the selected terms act as identity in H.
                H(Jsub,:)=0;
                H(:,Jsub)=0;
                H(Jind)=1;
            end
            T = normalizeRows(H); % jvd: make sure normalize last!

            % --- Prepare for kernel ------------------------------------------
            [D Dnull] = transD2tf(tf(:,ids),doclen(ids),T); % rbf kernel, D
            %D=tf'*T*T'*tf; Dnull = sparse(size(D,1),1);    % linear kernel, D
            % Augment D with a leading row/column built from Dnull.
            Dcache = [[0 Dnull'];[Dnull D]];

            % --- RBF Kernel --------------------------------------------------
            for kk=1:length(trbfgamma)
                fprintf('\t\t\ttrbfgamma = %f\n',trbfgamma(kk));

                % --- Cross-Validation ----------------------------------------
                for ff=1:numFolds
                    [trn tst] = sampleFromBinaryPool(maxNumPerClass,numTrainPerClass,numTestPerClass);

                    % necessary to key into our augmented kernel
                    % (index 1 is the extra leading row/column, hence the +1)
                    Kidx = [0 trn.samps tst.samps]+1;
                    % Use the gamma of the current sweep step (kk), matching
                    % the value printed above.
                    K = exp(-trbfgamma(kk) * Dcache(Kidx,Kidx)); % rbf kernel, K
                    %K = Dcache(Kidx,Kidx);                      % linear kernel, K
                    jvdmatrixWrite( K_file, K );

                    trn_vects = tf(:,ids(trn.samps));
                    tst_vects = tf(:,ids(tst.samps));
                    tacc(ii,jj,kk,ff) = svmWrapper( trn_vects, trn.labels, trn.ids,...
                        tst_vects, tst.labels, tst.ids, kernelfile_svmflags, 0 );
                end
                avgtacc(ii,jj,kk) = mean(tacc(ii,jj,kk,:))/(2*numTestPerClass);
                stdtacc(ii,jj,kk) = std(tacc(ii,jj,kk,:))/(2*numTestPerClass);
                fprintf('\t\t\t\tmean: %f stddev: %f\n', avgtacc(ii,jj,kk), stdtacc(ii,jj,kk));
            end

            % checkpoint save!
            save(result_file,...
                'cut','cc','tt','trbfgamma','tacc','rbfgamma','acc',...
                'avgtacc','stdtacc','avgacc','stdacc');
        end
    end

    save(result_file,...
        'cut','cc','tt','trbfgamma','tacc','rbfgamma','acc',...
        'avgtacc','stdtacc','avgacc','stdacc');
end

% Remove the temporary kernel file.
if exist(K_file,'file')
    delete(K_file);
end