% Cross-validates a one-vs-rest SVM with (a) a "translated" linear kernel
% built from each heat kernel found in heat_savedir and (b) the standard
% linear kernel, then saves the accuracies to
% [workdir 'acc_svm_lin_feat_q.7k.mat'].
%
% Inputs read from disk:
%   [workdir 'OneVsRest.7k.mat']  - CV split; must provide ids, trn_samps,
%                                   tst_samps, trn_zs, tst_zs and numFolds
%   [workdir 'tf.7k.mat']         - must provide tf (rows = words, cols = docs)
%   [heat_savedir 'Hc<c>t<t>...'] - must provide H; <c> and <t> are read
%                                   from the file name and divided by 100
%
% Variables kept in the saved result: tacc, acc, params, workdir (plus Jsub
% and Jind when feature selection is enabled).

addpath('./helper/');
workdir = './matfiles/';
heat_savedir = [workdir 'heat_kernels_q/'];

% --- Prep For CV -------------------------------------------------------------
% The commented-out block below is how OneVsRest.7k.mat was generated; it is
% kept for reference only.
%maxNumCachedPerClass = 4000;
%numFolds = 20;
%numTrainPerClass = 250;
%numTestPerClass = 100;
%c1labels = {'M11\d*','M12\d*','M13\d*','M14\d*'};
%load([workdir 'topics.mat']);
%load([workdir 'member.7k.mat']);
%
%% class1 will have label '1' while class2 will have label '-1'
%[ids trn_samps tst_samps trn_zs tst_zs] = prepOneVsRest( c1labels, topics, member, ...
%    maxNumCachedPerClass, numFolds, numTrainPerClass, numTestPerClass );
%
%clear topics member;
%% save all variables here
%save([workdir 'OneVsRest.7k.mat']);
load([workdir 'OneVsRest.7k.mat']);

% --- Initialization ----------------------------------------------------------
load([workdir 'tf.7k.mat']);
tf = tf(:,ids); % prune unused documents (cols)

%% remove unused words (rows)
%% this is not necessary since it would only affect the transD2 computation,
%% which never divides by the doc-freq of words
%inactivewords = find(~any(tf,2));
%tf(inactivewords,:) = []; % (rows)

% jvd: comment out the next four lines to disable feature selection
cut = min(2000, size(tf,1)); % clamp so a small vocabulary cannot yield a non-positive index
[df, I] = sort(sum(tf,2));   % ascending by row sum of tf
Jsub = I((end-cut+1):end);   % jvd: i.e., the words with highest doc freq (NOTE(review): row sums are doc freqs only if tf is binary - confirm)
Jind = sub2ind([size(tf,1) size(tf,1)],Jsub,Jsub); % linear indices of the diagonal entries (Jsub,Jsub); assumes H is nWords-by-nWords - TODO confirm

filelist = dir(heat_savedir);
ptrn = 'Hc(\d+)t(\d+)';

% Start from empty accumulators so that stale values from a loaded workspace
% cannot leak into the results and so that both variables exist even when no
% kernel file matches.
params = [];
tacc = [];

% --- Loop through parameters to test translation kernel ----------------------
fprintf('CV Linear Kernel under translation\n');
ii = 0; % number of kernels processed so far (column index into params/tacc)
for k = 1:numel(filelist)
    file = filelist(k).name;
    match = regexp(file,ptrn,'tokens');
    % jvd: by re-checking file existence, we can control execution by removing,
    %      but not adding kernels
    if isempty(match) || ~exist([heat_savedir file],'file')
        continue;
    end
    ii = ii+1;
    fprintf('%s\n',file);

    % Parameters are encoded in the file name as integers scaled by 100.
    % str2double is used instead of str2num because str2num evaluates its
    % argument.
    c = str2double(match{1}{1})/100;
    t = str2double(match{1}{2})/100;

    % Loads the kernel file's variables into the workspace; H is used below.
    % NOTE(review): if the file also stores variables named c or t, they
    % override the values parsed above before params is filled - confirm.
    load([heat_savedir file]);
    params(:,ii) = [c;t]; % consolidating

    % --- Optional Feature selection (on T) -----------------------------------
    % H(inactivewords,inactivewords) = []; % jvd: not needed, see above
    % jvd: do not translate the top 2000 words: set all rows and cols of
    %      words that occur too frequently to zero, then fix up the diagonal
    %      of those rows to be one.  In this way the most frequently
    %      occurring words are not translated.
    if exist('Jsub','var') && exist('Jind','var') && exist('cut','var')
        H(Jsub,:) = 0;
        H(:,Jsub) = 0;
        H(Jind) = 1;
    end
    % jvd: make sure normalize last!
    T = normalizeRows(H);

    % jvd: the linear kernel is simply:
    B = tf' * T*T' * tf;

    % --- Translated Linear Kernel --------------------------------------------
    TAcc = svmCVOneVsRest( trn_samps, tst_samps, trn_zs, tst_zs, tf, ...
        numFolds, '', 'tranlin', B);
    tacc(:,ii) = TAcc; % one column of accuracies per kernel, aligned with params
end

% --- Standard Linear Kernel --------------------------------------------------
fprintf('CV Linear Kernel\n');
acc = svmCVOneVsRest( trn_samps, tst_samps, trn_zs, tst_zs, tf, ...
    numFolds, '', 'stndlin');

% Drop everything except the results (and the feature-selection indices when
% feature selection was enabled) so that the save below stores only those.
if exist('Jsub','var') && exist('Jind','var') && exist('cut','var')
    clear_except tacc acc params Jsub Jind workdir;
else
    clear_except tacc acc params workdir;
end
save([workdir 'acc_svm_lin_feat_q.7k.mat']);