function [pred_store] = bayes_mars_class(data,test,q,costs,interaction,orders,SUR,LAP) % This is a simple version of reversible jump mcmc (Green, 1995) to perform nonparametric % Bayesian classification using MARS basis functions with unknown numbers of basis functions. % It can use either a Laplacian (double exponential) or Normal prior on the coefficients. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % INPUTS: % % data - training data: first column is integer Y \in {1,..,q) denoting class label; remaining columns are % covariates, X % test - test data: same format as 'data' % q - number of categories/classes, e.g q=2 is binary/logistic regression % costs - misclassification cost: e.g. costs = [1 1 2], denotes a three class classification problem... % where the cost of misclassifying category three is twice that of categories one or two. % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % OPTIONAL INPUTS - these have default settings but can be altered if you wish % % interaction - level of interaction in each basis function is uniform on {1,2,...interaction} % Default interaction=2. Note that interaction=1 produces an additive model % orders - set of allowable orders for the spline basis. e.g orders=[0 1] allows for both step % functions and linear splines, orders=[0] only allows step functions, etc % Default orders=[1], i.e. linear splines. % SUR - SUR \in {0,1} this is an indicator for Seemingly Unrelated Regressions, SUR=1: allocates each basis % function to one and only one class; SUR=0: means each basis function has (q-1) coefficients % associated with it. Note that SUR is redundant for two-class (binary) problems % Default: SUR=1; % LAP - {0,1} indicator for Laplacian (double exponential) prior on the coefficients or Gaussian % Defualt: LAP=1; use a Laplacian. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % OUTPUTS: - only very limited at present but you can add your own % % pred_store - mean prediction on the test set % % Originally written by Chris Holmes and medified by Veera % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % if nargin <=4 interaction=2; orders=[1]; SUR=1; LAP=1; end if q==2 SUR=0; end % The next lines are also our recomended default settings: STANDARDISE=1; % see below k_max = 500; % see below jump_std=0.1; % see below mcmc_its = 200000; % see below burn_in = 20000; % see below % STANDARDISE - \in {0,1}: indicates whether to standardise data set before calculating basis functions % I RECOMMEND YOU SET STANDARDISE=1 % jump_std - the variance of the update proposal kernel on the basis coefficients. This is % a user set parameter that needs tuning. Our recommendation is to set this value % to insure acceptance rates of around 20% % mcmc_its - the number of mcmc iterations; needs to be at least 1000, but the higher the better. % burn_in - the number of burn in iterations; should be at least 1000, but ALLWAYS CHECK THE MODEL % STATISTICS TO ENSURE CONVERGENCE OF THE MCMC CHAIN!!!!!!!!!!!!!! % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Now for the program % response should be stored in the first column of the data and test % it will be stored in Y Y = data(:,1); Yt = test(:,1); % convert classes to 1,...,q (rather than 0,...,q-1) if necessay if min(Y)==0 Y=Y+1; Yt=Yt+1; end % calculate misclassification cost matrices if all(costs==costs(1)) % then equal costs COSTS=0; else COSTS=1; % set up weighting matrices for predictions.... % convert costs to column vector if size(costs,2)>size(costs,1) costs=costs'; end costy = costs(Yt); cost_mat = diag(costs); % we will use these when calculating test misclassification errors end if q~=max(Y) % something is wrong error('Q does not match max of Y!'); end % store covariate/input/feature/predictor/explanatory (what ever you want to call them) variables % in X and Xt X = data(:,2:size(data,2)); Xt = test(:,2:size(test,2)); % get dimensions of data n = size(data,1); nt = size(test,1); p = size(X,2); % make some space clear data; clear test; % integrety check if p~=size(Xt,2) error('Different number of predictors in training and test sets'); end if STANDARDISE % then standardise data for i=1:p mx = mean(X(:,i)); sx = std(X(:,i)); X(:,i) = (X(:,i)-mx) / sx; Xt(:,i) = (Xt(:,i)-mx) / sx; end end % t is indicator of target category: t(i,j)=1 if Y(i)==j; t(i,j)=0 otherwise % same for t_test t = zeros(n,q); t_test = zeros(nt,q); for j=1:q fi = find(Y==j); t(fi,j)=1; fi = find(Yt==j); t_test(fi,j)=1; end % The next is necessary for numerical stability % Ensure that log(1-y) is computable: need exp(a) > eps maxcut = log(realmax) - log(q); % Ensure that exp(a) > 0 mincut = log(realmin); % Now lets begin with an intercept term..... % first fit best intercept model to data theta_hat = zeros(1,q); for j=1:q theta_hat(j) = sum(Y==j) / n; end % pred stores prediction pred = ones(n,1)*theta_hat; % eta stores linear predictor, That is: % pred(i,j) = pr(y_i==C_j) = exp(eta(i,j)) / sum_{j=1}^q exp(eta(i,j)) % where C_j is the jth class and eta(i,j) is known as the "linear predictor" as % it is formed by a linear combination of outputs from basis functions. % % hence....for our current intercept model. eta = zeros(n,q-1); % initialised for j=2:q eta(:,j-1) = log(theta_hat(j) / theta_hat(1)); end % Note that we only need (q-1) columns in eta as is common in GLM theory .... % we implicitly set the first basis coefficients to zero. % The interpretation of eta(i,j) is the "log odds ratio of class (j+1) to class 1" % ...now same for test linear predictor eta_test = zeros(nt,q-1); % set eta_test to same as eta (as we've only fitted constants at present) for j=1:q-1 eta_test(:,j)=eta(1,j); end % make first basis design: X_mars will store the n by k+1 matrix of outputs from the % k MARS basis functions (and one intercept) evaluated at the n data points. Xt_mars will store % the same for the test set. X_mars = zeros(n,k_max); X_mars(:,1)=1; % first column is intercept. Xt_mars = zeros(nt,k_max); Xt_mars(:,1)=1; % beta will store MARS basis coefficients % set up initially with just intercept term % NOTE THAT BETA IS OF DIMENSION (q-1) by 1, as the first beta is zero. beta(1,:) = eta(1,:); % as we've already calculated the optimal intecept. k = 1; % k stores number of basis functions: k=1 as we start with single intercept % % TO RECAP: % % the model with k basis functions is as follows: % % pr(y_i == C_j | x_i) = exp{ eta(i,j) } / [ sum_{v=1}^q exp{ eta(i,v) } ] % % where, % % eta(i,1) = 0 for all i % % eta(i,j) = X_mars(i,1:k) * beta(1:k,j-1) for j=2,..,q % % beta ~ iid from prior pi % % % get log likelihood of intercept model % first make sure eta is in valid range (due to numerical rounding). % .....these next few lines are copied from NetLab - the excellent package from Aston University a = min(eta, maxcut); a = max(eta, mincut); temp = [ones(n,1) exp(a)]; y = temp./(sum(temp, 2)*ones(1,q)); % Ensure that log(y) is computable y(y1 % swap a basis function for another one move =1; flag=3; else % just update coefficients (no change to basis functions) update=1; flag=4; end % store which move we are going to attempt prop(flag)=prop(flag)+1; % now depending on move type update the model if birth k_prop=k+1; % we have one more basis function % generate a basis function from the prior [X_prop knot_p var_p lr_p ord_p inter_p]=gen_mars_basis(X,interaction,orders); if SUR % set all but one beta to zero beta_prop(k_prop,:)=zeros(1,q-1); % initialise new beta (coefficients) indxb = ceil(rand*(q-1)); % choose a category for this basis function if LAP % Laplacian prior beta_prop(k_prop,indxb) = ((2*(rand>0.5))-1)*exprnd(1/prec); % draw the coefficient from the prior else % Normal beta_prop(k_prop,indxb) = (1/sqrt(prec))*randn; % draw the coefficient from the prior end % next two lines store an indicator of which category is associated with this basis function sur_indx_prop(k_prop,:)=zeros(1,q-1); sur_indx_prop(k_prop,indxb)=1; else % not SUR, hence all beta's are non-zero and again drawn from the prior if LAP beta_prop(k_prop,:) = ((2*(rand(1,q-1)>0.5))-1).*exprnd(1/prec,1,q-1); else beta_prop(k_prop,:) = (1/sqrt(prec))*randn(1,q-1); % draw the coefficient from the prior end end % Now calculate change to the linear predictor: % % eta_prop stores the new linear predictor....recall that % pr(y_i==j) = exp(eta(i,j)) / sum_{v=1}^q exp(eta(i,v)) % % now, as we've only changed one basis function we can alter the current % value of eta as follows eta_prop = eta + X_prop*beta_prop(k_prop,:); % due to colinearity in the basis set we should also propose to % update some other coefs as well. (if the basis space was % orthogonal this would not be necessary). % % we will select some (say 10) bases at random indx = randperm(k); upd = min(k,10); % make sure we don't select more than we have indx = indx(1:upd); beta_change = jump_std*randn(upd,q-1); %propose a change to some beta's if SUR beta_change = beta_change.*sur_indx_prop(indx,:); % set all but one class to zero end beta_prop(indx,:) = beta_prop(indx,:) + beta_change; % change the beta's % now update linear predictor eta_prop = eta_prop + X_mars(:,indx)*beta_change; % calculate prior of the new coeffs (clearly we using logs here) if LAP prior_pen = -prec*(sum(sum(abs(beta_prop(indx,:)))) - sum(sum(abs(beta(indx,:))))); else prior_pen = -0.5*prec*(sum(sum(beta_prop(indx,:).^2)) - sum(sum(beta(indx,:).^2))); end % we're assuming a flat prior on the intercept so add this back on if necessary if ismember(1,indx) if LAP prior_pen = prior_pen +prec*(sum(sum(abs(beta_prop(1,:)))) - sum(sum(abs(beta(1,:))))); else prior_pen = prior_pen +0.5*prec*(sum(beta_prop(1,:).^2) - sum(beta(1,:).^2)); end end % end of birth move step elseif death k_prop=k-1; % we've lost a basis function % choose a basis from the model to delete, NOT THE INTERCEPT THOUGH change_indx = ceil(rand*(k-1))+1; % remove it's contribution from the linear predictor eta_prop = eta - X_mars(:,change_indx)*beta(change_indx,:); % update other coefs but not the one just deleted indx = randperm(k_prop); upd = min(k_prop,10); list_of_bases = [1:(change_indx-1) change_indx+1:k]; indx = list_of_bases(indx(1:upd)); beta_change = jump_std*randn(upd,q-1); %propose update if SUR beta_change = beta_change.*sur_indx_prop(indx,:); % set all but one class to zero end beta_prop(indx,:) = beta_prop(indx,:) + beta_change; % make the change % update linear predictor eta_prop = eta_prop + X_mars(:,indx)*beta_change; % calculate prior of the new coeffs (clearly we using logs here) if LAP prior_pen = -prec*(sum(sum(abs(beta_prop(indx,:)))) - sum(sum(abs(beta(indx,:))))); else prior_pen = -0.5*prec*(sum(sum(beta_prop(indx,:).^2)) - sum(sum(beta(indx,:).^2))); end % we're assuming a flat prior on the intercept so add this back on if necessary if ismember(1,indx) if LAP prior_pen = prior_pen +prec*(sum(sum(abs(beta_prop(1,:)))) - sum(sum(abs(beta(1,:))))); else prior_pen = prior_pen +0.5*prec*(sum(beta_prop(1,:).^2) - sum(beta(1,:).^2)); end end % end of death step elseif move % this step changes one basis function % choose a basis from the model to swap with another, not intercept though change_indx = ceil(rand*(k-1))+1; % make change to the linear predictor by removing this basis function eta_prop = eta - X_mars(:,change_indx)*beta(change_indx,:); % now generate a new basis function from the prior [X_prop knot_p var_p lr_p ord_p inter_p]=gen_mars_basis(X,interaction,orders); % change coeff beta_prop(change_indx,:) = beta_prop(change_indx,:) + jump_std*randn(1,q-1); if SUR % set all but one class to zero if SUR beta_prop(change_indx,:) = beta_prop(change_indx,:).*sur_indx_prop(change_indx,:); end % make change to the linear predictor eta_prop = eta_prop + X_prop*beta_prop(change_indx,:); % get change to log prior due to new coeffs if LAP prior_pen = -prec*(sum(sum(abs(beta_prop(change_indx,:)))) - sum(sum(abs(beta(change_indx,:))))); else prior_pen = -0.5*prec*(sum(sum(beta_prop(change_indx,:).^2)) - sum(sum(beta(change_indx,:).^2))); end % update some other coefs (not the one just changed) indx = randperm(k_prop-1); upd = min(k_prop-1,10); list_of_bases = [1:(change_indx-1) change_indx+1:k]; indx = list_of_bases(indx(1:upd)); % note changes to beta's beta_change = jump_std*randn(upd,q-1); if SUR beta_change = beta_change.*sur_indx_prop(indx,:); %set coeffs on all but one class to zero end beta_prop(indx,:) = beta_prop(indx,:) + beta_change; % associated change to the linear predictor eta_prop = eta_prop + X_mars(:,indx)*beta_change; % calculate prior of the new coeffs (clearly we using logs here) if LAP prior_pen = prior_pen-prec*(sum(sum(abs(beta_prop(indx,:)))) - sum(sum(abs(beta(indx,:))))); else prior_pen = prior_pen-0.5*prec*(sum(sum(beta_prop(indx,:).^2)) - sum(sum(beta(indx,:).^2))); end % we're assuming a flat prior on the intercept so add this back on if necessary if ismember(1,indx) if LAP prior_pen = prior_pen +prec*(sum(sum(abs(beta_prop(1,:)))) - sum(sum(abs(beta(1,:))))); else prior_pen = prior_pen +0.5*prec*(sum(beta_prop(1,:).^2) - sum(beta(1,:).^2)); end end % end of move step else % just updating coefficients (not change to basis design matrix X_mars(:,1:k) ) % choose some coeffs to update indx = randperm(k_prop); upd = min(k_prop,10); indx = indx(1:upd); % make changes to the beta's beta_change = jump_std*randn(upd,q-1); if SUR beta_change = beta_change.*sur_indx_prop(indx,:); end beta_prop(indx,:) = beta_prop(indx,:) + beta_change; % associated change to the linear predictor eta_prop = eta_prop + X_mars(:,indx)*beta_change; % calculate prior of the new coeffs (clearly we using logs here) if LAP prior_pen = -prec*(sum(sum(abs(beta_prop(indx,:)))) - sum(sum(abs(beta(indx,:))))); else prior_pen = -0.5*prec*(sum(sum(beta_prop(indx,:).^2)) - sum(sum(beta(indx,:).^2))); end % we're assuming a flat prior on the intercept so add this back on if necessary if ismember(1,indx) if LAP prior_pen = prior_pen +prec*(sum(sum(abs(beta_prop(1,:)))) - sum(sum(abs(beta(1,:))))); else prior_pen = prior_pen +0.5*prec*(sum(beta_prop(1,:).^2) - sum(beta(1,:).^2)); end end end % this ends the update proposals % Now get the log likelihood of proposed model a = eta_prop; % numerical checking (thanks to NetLab) a = min(a, maxcut); a = max(a, mincut); temp = [ones(n,1) exp(a)]; y = temp./(sum(temp, 2)*ones(1,q)); % Ensure that log(y) is computable y(y 200 & k>1 if LAP % prior on beta is: beta ~ Lap(0, prec^{-1}) beta_val = sum(sum(abs(beta(2:k,:)))); if SUR prec = (1/beta_val)*randgamma_mat((k-1),1,1); else prec = (1/beta_val).*randgamma_mat((k-1)*(q-1),1,1); end else % prior on beta is : beta ~ N(0, prec^{-1} I) beta_val = sum(sum(beta(2:k,:).^2)); if SUR prec = (1./(0.5*beta_val)).*randgamma_mat(0.5*(k-1),1,1); else prec = (1./(0.5*beta_val)).*randgamma_mat(0.5*(k-1)*(q-1),1,1); end end % Laplace or Normal prior end % routine to update precision % Next section records model after burn in if count>burn_in % start collecting samples sample=sample+1; a=eta_test; % numerical checks.... a = min(a, maxcut); a = max(a, mincut); temp = [ones(nt,1) exp(a)]; y = temp./(sum(temp, 2)*ones(1,q)); % Ensure that log(y) is computable y(y