som_norm_variable.m | searchcode

/code_oth/som_norm_variable.m

http://research-code-base-animesh.googlecode.com/
MATLAB | 533 lines | 240 code | 37 blank | 256 comment | 38 complexity | 680db73cc1cdf69de85f4bd5b4dcb75a MD5 | raw file

function [x,sNorm] = som_norm_variable(x, method, operation)



%SOM_NORM_VARIABLE Normalize or denormalize a scalar variable.

%

% [x,sNorm] = som_norm_variable(x, method, operation)

%

%   xnew = som_norm_variable(x,'var','do');

%   [dummy,sN] = som_norm_variable(x,'log','init');

%   [xnew,sN]  = som_norm_variable(x,sN,'do');

%   xorig      = som_norm_variable(xnew,sN,'undo');

%

%  Input and output arguments: 

%   x         (vector) a set of values of a scalar variable for

%                      which the (de)normalization is performed.

%                      The processed values are returned.

%   method    (string) identifier for a normalization method: 'var',

%                      'range', 'log', 'logistic', 'histD', or 'histC'.

%                      A normalization struct with default values is created.

%             (struct) normalization struct, or an array of such

%             (cellstr) first string gives normalization operation, and the

%                      second gives denormalization operation, with x 

%                      representing the variable, for example: 

%                      {'x+2','x-2}, or {'exp(-x)','-log(x)'} or {'round(x)'}.

%                      Note that in the last case, no denorm operation is 

%                      defined. 

%   operation (string) the operation to be performed: 'init', 'do' or 'undo'

%                     

%   sNorm     (struct) updated normalization struct/struct array

%

% For more help, try 'type som_norm_variable' or check out online documentation.

% See also SOM_NORMALIZE, SOM_DENORMALIZE.



%%%%%%%%%%%%% DETAILED DESCRIPTION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%

% som_norm_variable

%

% PURPOSE

%

% Initialize, apply and undo normalizations on a given vector of

% scalar values.

%

% SYNTAX

%

%  xnew = som_norm_variable(x,method,operation)

%  xnew = som_norm_variable(x,sNorm,operation)

%  [xnew,sNorm] = som_norm_variable(...)

%

% DESCRIPTION

%

% This function is used to initialize, apply and undo normalizations

% on scalar variables. It is the low-level function that upper-level

% functions SOM_NORMALIZE and SOM_DENORMALIZE utilize to actually (un)do

% the normalizations.

%

% Normalizations are typically performed to control the variance of 

% vector components. If some vector components have variance which is

% significantly higher than the variance of other components, those

% components will dominate the map organization. Normalization of 

% the variance of vector components (method 'var') is used to prevent 

% that. In addition to variance normalization, other methods have

% been implemented as well (see list below). 

%

% Usually normalizations convert the variable values so that they no 

% longer make any sense: the values are still ordered, but their range 

% may have changed so radically that interpreting the numbers in the 

% original context is very hard. For this reason all implemented methods

% are (more or less) revertible. The normalizations are monotonic

% and information is saved so that they can be undone. Also, the saved

% information makes it possible to apply the EXACTLY SAME normalization

% to another set of values. The normalization information is determined

% with 'init' operation, while 'do' and 'undo' operations are used to

% apply or revert the normalization. 

%

% The normalization information is saved in a normalization struct, 

% which is returned as the second argument of this function. Note that 

% normalization operations may be stacked. In this case, normalization 

% structs are positioned in a struct array. When applied, the array is 

% gone through from start to end, and when undone, in reverse order.

%

%    method  description

%

%    'var'   Variance normalization. A linear transformation which 

%            scales the values such that their variance=1. This is

%            convenient way to use Mahalanobis distance measure without

%            actually changing the distance calculation procedure.

%

%    'range' Normalization of range of values. A linear transformation

%            which scales the values between [0,1]. 

%

%    'log'   Logarithmic normalization. In many cases the values of

%            a vector component are exponentially distributed. This 

%            normalization is a good way to get more resolution to

%            (the low end of) that vector component. What this 

%            actually does is a non-linear transformation: 

%               x_new = log(x_old - m + 1) 

%            where m=min(x_old) and log is the natural logarithm. 

%            Applying the transformation to a value which is lower 

%            than m-1 will give problems, as the result is then complex.

%            If the minimum for values is known a priori, 

%            it might be a good idea to initialize the normalization with

%              [dummy,sN] = som_norm_variable(minimum,'log','init');

%            and normalize only after this: 

%              x_new = som_norm_variable(x,sN,'do');

%

%    'logistic' or softmax normalization. This normalization ensures

%            that all values in the future, too, are within the range

%            [0,1]. The transformation is more-or-less linear in the 

%            middle range (around mean value), and has a smooth 

%            nonlinearity at both ends which ensures that all values

%            are within the range. The data is first scaled as in 

%            variance normalization: 

%               x_scaled = (x_old - mean(x_old))/std(x_old)

%            and then transformed with the logistic function

%               x_new = 1/(1+exp(-x_scaled))

% 

%    'histD' Discrete histogram equalization. Non-linear. Orders the 

%            values and replaces each value by its ordinal number. 

%            Finally, scales the values such that they are between [0,1].

%            Useful for both discrete and continuous variables, but as 

%            the saved normalization information consists of all 

%            unique values of the initialization data set, it may use

%            considerable amounts of memory. If the variable can get

%            more than a few values (say, 20), it might be better to

%            use 'histC' method below. Another important note is that

%            this method is not exactly revertible if it is applied

%            to values which are not part of the original value set.

%            

%    'histC' Continuous histogram equalization. Actually, a partially

%            linear transformation which tries to do something like 

%            histogram equalization. The value range is divided to 

%            a number of bins such that the number of values in each

%            bin is (almost) the same. The values are transformed 

%            linearly in each bin. For example, values in bin number 3

%            are scaled between [3,4[. Finally, all values are scaled

%            between [0,1]. The number of bins is the square root

%            of the number of unique values in the initialization set,

%            rounded up. The resulting histogram equalization is not

%            as good as the one that 'histD' makes, but the benefit

%            is that it is exactly revertible - even outside the 

%            original value range (although the results may be funny).

%

%    'eval'  With this method, freeform normalization operations can be 

%            specified. The parameter field contains strings to be 

%            evaluated with 'eval' function, with variable name 'x'

%            representing the variable itself. The first string is 

%            the normalization operation, and the second is a 

%            denormalization operation. If the denormalization operation

%            is empty, it is ignored.

% 

% INPUT ARGUMENTS

%

%   x          (vector) The scalar values to which the normalization      

%                       operation is applied.

%                     

%   method              The normalization specification.

%              (string) Identifier for a normalization method: 'var', 

%                       'range', 'log', 'logistic', 'histD' or 'histC'. 

%                       Corresponding default normalization struct is created.

%              (struct) normalization struct 

%              (struct array) of normalization structs, applied to 

%                       x one after the other

%              (cellstr) of length 

%              (cellstr array) first string gives normalization operation, and 

%                       the second gives denormalization operation, with x 

%                       representing the variable, for example: 

%                       {'x+2','x-2}, or {'exp(-x)','-log(x)'} or {'round(x)'}.

%                       Note that in the last case, no denorm operation is 

%                       defined. 

%

%               note: if the method is given as struct(s), it is

%                     applied (done or undone, as specified by operation)

%                     regardless of what the value of '.status' field

%                     is in the struct(s). Only if the status is

%                     'uninit', the undoing operation is halted.

%                     Anyhow, the '.status' fields in the returned 

%                     normalization struct(s) is set to approriate value.

%   

%   operation  (string) The operation to perform: 'init' to initialize

%                       the normalization struct, 'do' to perform the 

%                       normalization, 'undo' to undo the normalization, 

%                       if possible. If operation 'do' is given, but the

%                       normalization struct has not yet been initialized,

%                       it is initialized using the given data (x).

%

% OUTPUT ARGUMENTS

% 

%   x        (vector) Appropriately processed values. 

%

%   sNorm    (struct) Updated normalization struct/struct array. If any,

%                     the '.status' and '.params' fields are updated.

% 

% EXAMPLES

%

%  To initialize and apply a normalization on a set of scalar values: 

%

%    [x_new,sN] = som_norm_variable(x_old,'var','do'); 

%

%  To just initialize, use: 

% 

%    [dummy,sN] = som_norm_variable(x_old,'var','init'); 

% 

%  To undo the normalization(s): 

%

%    x_orig = som_norm_variable(x_new,sN,'undo');

%

%  Typically, normalizations of data structs/sets are handled using

%  functions SOM_NORMALIZE and SOM_DENORMALIZE. However, when only the

%  values of a single variable are of interest, SOM_NORM_VARIABLE may 

%  be useful. For example, assume one wants to apply the normalization

%  done on a component (i) of a data struct (sD) to a new set of values 

%  (x) of that component. With SOM_NORM_VARIABLE this can be done with: 

%

%    x_new = som_norm_variable(x,sD.comp_norm{i},'do'); 

% 

%  Now, as the normalizations in sD.comp_norm{i} have already been 

%  initialized with the original data set (presumably sD.data), 

%  the EXACTLY SAME normalization(s) can be applied to the new values.

%  The same thing can be done with SOM_NORMALIZE function, too: 

%

%    x_new = som_normalize(x,sD.comp_norm{i}); 

%

%  Or, if the new data set were in variable D - a matrix of same

%  dimension as the original data set: 

%

%    D_new = som_normalize(D,sD,i);

%

% SEE ALSO

%  

%  som_normalize    Add/apply/redo normalizations for a data struct/set.

%  som_denormalize  Undo normalizations of a data struct/set.



% Copyright (c) 1998-2000 by the SOM toolbox programming team.

% http://www.cis.hut.fi/projects/somtoolbox/



% Version 2.0beta juuso 151199 170400 150500



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% check arguments



error(nargchk(3, 3, nargin));  % check no. of input arguments is correct



% method

sNorm = []; 

if ischar(method) 

  if any(strcmp(method,{'var','range','log','logistic','histD','histC'})), 

    sNorm = som_set('som_norm','method',method);

  else 

    method = cellstr(method); 

  end

end

if iscell(method), 

  if length(method)==1 & isstruct(method{1}), sNorm = method{1}; 

  else

    if length(method)==1 | isempty(method{2}), method{2} = 'x'; end

    sNorm = som_set('som_norm','method','eval','params',method);

  end

else 

  sNorm = method; 

end



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% action



order = [1:length(sNorm)]; 

if length(order)>1 & strcmp(operation,'undo'), order = order(end:-1:1); end



for i=order, 



  % initialize

  if strcmp(operation,'init') | ...

     (strcmp(operation,'do') & strcmp(sNorm(i).status,'uninit')), 



    % case method = 'hist'

    if strcmp(sNorm(i).method,'hist'), 

      inds = find(~isnan(x) & ~isinf(x));

      if length(unique(x(inds)))>20, sNorm(i).method = 'histC'; 

      else sNorm{i}.method = 'histD'; end

    end



    switch(sNorm(i).method), 

    case 'var',   params = norm_variance_init(x);

    case 'range', params = norm_scale01_init(x);

    case 'log',   params = norm_log_init(x);

    case 'logistic', params = norm_logistic_init(x);

    case 'histD', params = norm_histeqD_init(x);

    case 'histC', params = norm_histeqC_init(x);

    case 'eval',  params = sNorm(i).params; 

    otherwise, 

      error(['Unrecognized method: ' sNorm(i).method]); 

    end

    sNorm(i).params = params;

    sNorm(i).status = 'undone';

  end



  % do / undo

  if strcmp(operation,'do'), 

    switch(sNorm(i).method), 

    case 'var',   x = norm_scale_do(x,sNorm(i).params);

    case 'range', x = norm_scale_do(x,sNorm(i).params);

    case 'log',   x = norm_log_do(x,sNorm(i).params);

    case 'logistic', x = norm_logistic_do(x,sNorm(i).params);

    case 'histD', x = norm_histeqD_do(x,sNorm(i).params);

    case 'histC', x = norm_histeqC_do(x,sNorm(i).params);

    case 'eval',  x = norm_eval_do(x,sNorm(i).params);

    otherwise, 

      error(['Unrecognized method: ' sNorm(i).method]);

    end

    sNorm(i).status = 'done';



  elseif strcmp(operation,'undo'), 



    if strcmp(sNorm(i).status,'uninit'), 

      warning('Could not undo: uninitialized normalization struct.')

      break;

    end

    switch(sNorm(i).method), 

    case 'var',   x = norm_scale_undo(x,sNorm(i).params);

    case 'range', x = norm_scale_undo(x,sNorm(i).params);

    case 'log',   x = norm_log_undo(x,sNorm(i).params);    

    case 'logistic', x = norm_logistic_undo(x,sNorm(i).params);

    case 'histD', x = norm_histeqD_undo(x,sNorm(i).params);

    case 'histC', x = norm_histeqC_undo(x,sNorm(i).params);

    case 'eval',  x = norm_eval_undo(x,sNorm(i).params);

    otherwise, 

      error(['Unrecognized method: ' sNorm(i).method]);

    end

    sNorm(i).status = 'undone';



  elseif ~strcmp(operation,'init'),



    error(['Unrecognized operation: ' operation])



  end

end  



return;



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% subfunctions



% linear scaling



function p = norm_variance_init(x)

  inds = find(~isnan(x) & isfinite(x));

  p = [mean(x(inds)), std(x(inds))];

  if p(2) == 0, p(2) = 1; end

  %end of norm_variance_init



function p = norm_scale01_init(x)

  inds = find(~isnan(x) & isfinite(x));

  mi = min(x(inds)); 

  ma = max(x(inds));

  if mi == ma, p = [mi, 1]; else p = [mi, ma-mi]; end

  %end of norm_scale01_init  



function x = norm_scale_do(x,p)

  x = (x - p(1)) / p(2);

  % end of norm_scale_do



function x = norm_scale_undo(x,p)

  x = x * p(2) + p(1);

  % end of norm_scale_undo



% logarithm



function p = norm_log_init(x)

  inds = find(~isnan(x) & isfinite(x));

  p = min(x(inds));

  % end of norm_log_init



function x = norm_log_do(x,p)

  x = log(x - p +1); 

  % if any(~isreal(x)), ok = 0; end

  % end of norm_log_do 



function x = norm_log_undo(x,p)

  x = exp(x) -1 + p; 

  % end of norm_log_undo 



% logistic



function p = norm_logistic_init(x)

  inds = find(~isnan(x) & isfinite(x));

  p = [mean(x(inds)), std(x(inds))];

  if p(2)==0, p(2) = 1; end

  % end of norm_logistic_init



function x = norm_logistic_do(x,p)

  x = (x-p(1))/p(2);

  x = 1./(1+exp(-x));

  % end of norm_logistic_do



function x = norm_logistic_undo(x,p)

  x = log(x./(1-x));

  x = x*p(2)+p(1);

  % end of norm_logistic_undo



% histogram equalization for discrete values



function p = norm_histeqD_init(x)

  inds = find(~isnan(x) & ~isinf(x));

  p = unique(x(inds));

  % end of norm_histeqD_init



function x = norm_histeqD_do(x,p)

  bins = length(p);

  inds = find(~isnan(x) & ~isinf(x))';

  for i = inds, 

    [dummy ind] = min(abs(x(i) - p));

    % data item closer to the left-hand bin wall is indexed after RH wall

    if x(i) > p(ind) & ind < bins, 

      x(i) = ind + 1;  

    else 

      x(i) = ind;

    end

  end

  x = (x-1)/(bins-1); % normalization between [0,1]

  % end of norm_histeqD_do 



function x = norm_histeqD_undo(x,p)

  bins = length(p);

  x = round(x*(bins-1)+1);

  inds = find(~isnan(x) & ~isinf(x));

  x(inds) = p(x(inds));

  % end of norm_histeqD_undo



% histogram equalization with partially linear functions



function p = norm_histeqC_init(x)

  % investigate x

  inds = find(~isnan(x) & ~isinf(x));

  samples = length(inds);

  xs = unique(x(inds));

  mi = xs(1);

  ma = xs(end);

  % decide number of limits

  lims = ceil(sqrt(length(xs))); % 2->2,100->10,1000->32,10000->100

  % decide limits

  if lims==1,     

    p = [mi, mi+1];

    lims = 2; 

  elseif lims==2, 

    p = [mi, ma];

  else

    p = zeros(lims,1);   

    p(1) = mi; 

    p(end) = ma;

    binsize = zeros(lims-1,1); b = 1; avebinsize = samples/(lims-1);

    for i=1:(length(xs)-1), 

      binsize(b) = binsize(b) + sum(x==xs(i)); 

      if binsize(b) >= avebinsize, 

        b = b + 1;

        p(b) = (xs(i)+xs(i+1))/2;

      end

      if b==(lims-1), 

        binsize(b) = samples-sum(binsize); break;

      else

        avebinsize = (samples-sum(binsize))/(lims-1-b);

      end

    end

  end

  % end of norm_histeqC_init



function x = norm_histeqC_do(x,p)

  xnew = x; 

  lims = length(p);

  % handle values below minimum

  r = p(2)-p(1); 

  inds = find(x<=p(1) & isfinite(x)); 

  if any(inds), xnew(inds) = 0-(p(1)-x(inds))/r; end 

  % handle values above maximum

  r = p(end)-p(end-1); 

  inds = find(x>p(end) & isfinite(x)); 

  if any(inds), xnew(inds) = lims-1+(x(inds)-p(end))/r; end

  % handle all other values

  for i=1:(lims-1), 

    r0 = p(i); r1 = p(i+1); r = r1-r0; 

    inds = find(x>r0 & x<=r1); 

    if any(inds), xnew(inds) = i-1+(x(inds)-r0)/r; end

  end

  % scale so that minimum and maximum correspond to 0 and 1

  x = xnew/(lims-1);

  % end of norm_histeqC_do



function x = norm_histeqC_undo(x,p)

  xnew = x; 

  lims = length(p); 

  % scale so that 0 and 1 correspond to minimum and maximum

  x = x*(lims-1);



  % handle values below minimum

  r = p(2)-p(1); 

  inds = find(x<=0 & isfinite(x)); 

  if any(inds), xnew(inds) = x(inds)*r + p(1); end 

  % handle values above maximum

  r = p(end)-p(end-1); 

  inds = find(x>lims-1 & isfinite(x)); 

  if any(inds), xnew(inds) = (x(inds)-(lims-1))*r+p(end); end

  % handle all other values

  for i=1:(lims-1), 

    r0 = p(i); r1 = p(i+1); r = r1-r0; 

    inds = find(x>i-1 & x<=i); 

    if any(inds), xnew(inds) = (x(inds)-(i-1))*r + r0; end

  end

  x = xnew;

  % end of norm_histeqC_undo



% eval



function p = norm_eval_init(method)

  p = method;

  %end of norm_eval_init



function x = norm_eval_do(x,p)

  x_tmp = eval(p{1});

  if size(x_tmp,1)>=1 & size(x,1)>=1 & ...

     size(x_tmp,2)==1 & size(x,2)==1,

    x = x_tmp;

  end

  %end of norm_eval_do



function x = norm_eval_undo(x,p)

  x_tmp = eval(p{2});

  if size(x_tmp,1)>=1 & size(x,1)>=1 & ...

     size(x_tmp,2)==1 & size(x,2)==1,

    x = x_tmp;

  end

  %end of norm_eval_undo



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%