PageRenderTime 27ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 0ms

/code_oth/som_norm_variable.m

http://research-code-base-animesh.googlecode.com/
MATLAB | 533 lines | 240 code | 37 blank | 256 comment | 38 complexity | 680db73cc1cdf69de85f4bd5b4dcb75a MD5 | raw file
  function [x,sNorm] = som_norm_variable(x, method, operation)

  %SOM_NORM_VARIABLE Normalize or denormalize a scalar variable.
  %
  % [x,sNorm] = som_norm_variable(x, method, operation)
  %
  %   xnew = som_norm_variable(x,'var','do');
  %   [dummy,sN] = som_norm_variable(x,'log','init');
  %   [xnew,sN]  = som_norm_variable(x,sN,'do');
  %   xorig      = som_norm_variable(xnew,sN,'undo');
  %
  %  Input and output arguments:
  %   x         (vector) a set of values of a scalar variable for
  %                      which the (de)normalization is performed.
  %                      The processed values are returned.
  %   method    (string) identifier for a normalization method: 'var',
  %                      'range', 'log', 'logistic', 'histD', or 'histC'.
  %                      A normalization struct with default values is created.
  %             (struct) normalization struct, or an array of such
  %             (cellstr) first string gives normalization operation, and the
  %                      second gives denormalization operation, with x
  %                      representing the variable, for example:
  %                      {'x+2','x-2'}, or {'exp(-x)','-log(x)'} or {'round(x)'}.
  %                      Note that in the last case, no denorm operation is
  %                      defined.
  %   operation (string) the operation to be performed: 'init', 'do' or 'undo'
  %
  %   sNorm     (struct) updated normalization struct/struct array
  %
  % For more help, try 'type som_norm_variable' or check out online documentation.
  % See also SOM_NORMALIZE, SOM_DENORMALIZE.

  %%%%%%%%%%%%% DETAILED DESCRIPTION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  %
  % som_norm_variable
  %
  % PURPOSE
  %
  % Initialize, apply and undo normalizations on a given vector of
  % scalar values.
  %
  % SYNTAX
  %
  %  xnew = som_norm_variable(x,method,operation)
  %  xnew = som_norm_variable(x,sNorm,operation)
  %  [xnew,sNorm] = som_norm_variable(...)
  %
  % DESCRIPTION
  %
  % This function is used to initialize, apply and undo normalizations
  % on scalar variables. It is the low-level function that upper-level
  % functions SOM_NORMALIZE and SOM_DENORMALIZE utilize to actually (un)do
  % the normalizations.
  %
  % Normalizations are typically performed to control the variance of
  % vector components. If some vector components have variance which is
  % significantly higher than the variance of other components, those
  % components will dominate the map organization. Normalization of
  % the variance of vector components (method 'var') is used to prevent
  % that. In addition to variance normalization, other methods have
  % been implemented as well (see list below).
  %
  % Usually normalizations convert the variable values so that they no
  % longer make any sense: the values are still ordered, but their range
  % may have changed so radically that interpreting the numbers in the
  % original context is very hard. For this reason all implemented methods
  % are (more or less) revertible. The normalizations are monotonic
  % and information is saved so that they can be undone. Also, the saved
  % information makes it possible to apply the EXACTLY SAME normalization
  % to another set of values. The normalization information is determined
  % with 'init' operation, while 'do' and 'undo' operations are used to
  % apply or revert the normalization.
  %
  % The normalization information is saved in a normalization struct,
  % which is returned as the second argument of this function. Note that
  % normalization operations may be stacked. In this case, normalization
  % structs are positioned in a struct array. When applied, the array is
  % gone through from start to end, and when undone, in reverse order.
  %
  %    method  description
  %
  %    'var'   Variance normalization. A linear transformation which
  %            scales the values such that their variance=1. This is
  %            convenient way to use Mahalanobis distance measure without
  %            actually changing the distance calculation procedure.
  %
  %    'range' Normalization of range of values. A linear transformation
  %            which scales the values between [0,1].
  %
  %    'log'   Logarithmic normalization. In many cases the values of
  %            a vector component are exponentially distributed. This
  %            normalization is a good way to get more resolution to
  %            (the low end of) that vector component. What this
  %            actually does is a non-linear transformation:
  %               x_new = log(x_old - m + 1)
  %            where m=min(x_old) and log is the natural logarithm.
  %            Applying the transformation to a value which is lower
  %            than m-1 will give problems, as the result is then complex.
  %            If the minimum for values is known a priori,
  %            it might be a good idea to initialize the normalization with
  %               [dummy,sN] = som_norm_variable(minimum,'log','init');
  %            and normalize only after this:
  %               x_new = som_norm_variable(x,sN,'do');
  %
  %    'logistic' or softmax normalization. This normalization ensures
  %            that all values in the future, too, are within the range
  %            [0,1]. The transformation is more-or-less linear in the
  %            middle range (around mean value), and has a smooth
  %            nonlinearity at both ends which ensures that all values
  %            are within the range. The data is first scaled as in
  %            variance normalization:
  %               x_scaled = (x_old - mean(x_old))/std(x_old)
  %            and then transformed with the logistic function
  %               x_new = 1/(1+exp(-x_scaled))
  %
  %    'histD' Discrete histogram equalization. Non-linear. Orders the
  %            values and replaces each value by its ordinal number.
  %            Finally, scales the values such that they are between [0,1].
  %            Useful for both discrete and continuous variables, but as
  %            the saved normalization information consists of all
  %            unique values of the initialization data set, it may use
  %            considerable amounts of memory. If the variable can get
  %            more than a few values (say, 20), it might be better to
  %            use 'histC' method below. Another important note is that
  %            this method is not exactly revertible if it is applied
  %            to values which are not part of the original value set.
  %
  %    'histC' Continuous histogram equalization. Actually, a partially
  %            linear transformation which tries to do something like
  %            histogram equalization. The value range is divided to
  %            a number of bins such that the number of values in each
  %            bin is (almost) the same. The values are transformed
  %            linearly in each bin. For example, values in bin number 3
  %            are scaled between [3,4[. Finally, all values are scaled
  %            between [0,1]. The number of bins is the square root
  %            of the number of unique values in the initialization set,
  %            rounded up. The resulting histogram equalization is not
  %            as good as the one that 'histD' makes, but the benefit
  %            is that it is exactly revertible - even outside the
  %            original value range (although the results may be funny).
  %
  %    'eval'  With this method, freeform normalization operations can be
  %            specified. The parameter field contains strings to be
  %            evaluated with 'eval' function, with variable name 'x'
  %            representing the variable itself. The first string is
  %            the normalization operation, and the second is a
  %            denormalization operation. If the denormalization operation
  %            is empty, it is ignored.
  %
  % INPUT ARGUMENTS
  %
  %   x          (vector) The scalar values to which the normalization
  %                       operation is applied.
  %
  %   method              The normalization specification.
  %              (string) Identifier for a normalization method: 'var',
  %                       'range', 'log', 'logistic', 'histD' or 'histC'.
  %                       Corresponding default normalization struct is created.
  %              (struct) normalization struct
  %              (struct array) of normalization structs, applied to
  %                       x one after the other
  %              (cellstr) of length one or two:
  %                       first string gives normalization operation, and
  %                       the second gives denormalization operation, with x
  %                       representing the variable, for example:
  %                       {'x+2','x-2'}, or {'exp(-x)','-log(x)'} or {'round(x)'}.
  %                       Note that in the last case, no denorm operation is
  %                       defined.
  %
  %               note: if the method is given as struct(s), it is
  %                     applied (done or undone, as specified by operation)
  %                     regardless of what the value of '.status' field
  %                     is in the struct(s). Only if the status is
  %                     'uninit', the undoing operation is halted.
  %                     Anyhow, the '.status' fields in the returned
  %                     normalization struct(s) is set to appropriate value.
  %
  %   operation  (string) The operation to perform: 'init' to initialize
  %                       the normalization struct, 'do' to perform the
  %                       normalization, 'undo' to undo the normalization,
  %                       if possible. If operation 'do' is given, but the
  %                       normalization struct has not yet been initialized,
  %                       it is initialized using the given data (x).
  %
  % OUTPUT ARGUMENTS
  %
  %   x        (vector) Appropriately processed values.
  %
  %   sNorm    (struct) Updated normalization struct/struct array. If any,
  %                     the '.status' and '.params' fields are updated.
  %
  % EXAMPLES
  %
  %  To initialize and apply a normalization on a set of scalar values:
  %
  %    [x_new,sN] = som_norm_variable(x_old,'var','do');
  %
  %  To just initialize, use:
  %
  %    [dummy,sN] = som_norm_variable(x_old,'var','init');
  %
  %  To undo the normalization(s):
  %
  %    x_orig = som_norm_variable(x_new,sN,'undo');
  %
  %  Typically, normalizations of data structs/sets are handled using
  %  functions SOM_NORMALIZE and SOM_DENORMALIZE. However, when only the
  %  values of a single variable are of interest, SOM_NORM_VARIABLE may
  %  be useful. For example, assume one wants to apply the normalization
  %  done on a component (i) of a data struct (sD) to a new set of values
  %  (x) of that component. With SOM_NORM_VARIABLE this can be done with:
  %
  %    x_new = som_norm_variable(x,sD.comp_norm{i},'do');
  %
  %  Now, as the normalizations in sD.comp_norm{i} have already been
  %  initialized with the original data set (presumably sD.data),
  %  the EXACTLY SAME normalization(s) can be applied to the new values.
  %  The same thing can be done with SOM_NORMALIZE function, too:
  %
  %    x_new = som_normalize(x,sD.comp_norm{i});
  %
  %  Or, if the new data set were in variable D - a matrix of same
  %  dimension as the original data set:
  %
  %    D_new = som_normalize(D,sD,i);
  %
  % SEE ALSO
  %
  %  som_normalize    Add/apply/redo normalizations for a data struct/set.
  %  som_denormalize  Undo normalizations of a data struct/set.

  % Copyright (c) 1998-2000 by the SOM toolbox programming team.
  % http://www.cis.hut.fi/projects/somtoolbox/

  % Version 2.0beta juuso 151199 170400 150500

  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  %% check arguments

  error(nargchk(3, 3, nargin));  % check no. of input arguments is correct

  % method: resolve the argument into a normalization struct (array)
  sNorm = [];
  if ischar(method)
    if any(strcmp(method,{'var','range','log','logistic','histD','histC'})),
      % known method name: build a default normalization struct
      sNorm = som_set('som_norm','method',method);
    else
      % any other string is handled below as a freeform 'eval' expression
      method = cellstr(method);
    end
  end
  if iscell(method),
    if length(method)==1 & isstruct(method{1}), sNorm = method{1};
    else
      % no (or empty) denormalization string: use the identity 'x'
      if length(method)==1 | isempty(method{2}), method{2} = 'x'; end
      sNorm = som_set('som_norm','method','eval','params',method);
    end
  elseif ~ischar(method),
    % struct / struct array given directly. A recognised method string
    % must NOT reach this branch, otherwise the struct created above
    % would be overwritten by the string itself.
    sNorm = method;
  end

  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  %% action

  % stacked normalizations are applied first-to-last, undone last-to-first
  order = [1:length(sNorm)];
  if length(order)>1 & strcmp(operation,'undo'), order = order(end:-1:1); end

  for i=order,

    % initialize
    if strcmp(operation,'init') | ...
       (strcmp(operation,'do') & strcmp(sNorm(i).status,'uninit')),

      % case method = 'hist': choose continuous equalization when there are
      % more than 20 unique finite values, discrete otherwise
      if strcmp(sNorm(i).method,'hist'),
        inds = find(~isnan(x) & ~isinf(x));
        if length(unique(x(inds)))>20, sNorm(i).method = 'histC';
        else sNorm(i).method = 'histD'; end
      end

      switch(sNorm(i).method),
      case 'var',      params = norm_variance_init(x);
      case 'range',    params = norm_scale01_init(x);
      case 'log',      params = norm_log_init(x);
      case 'logistic', params = norm_logistic_init(x);
      case 'histD',    params = norm_histeqD_init(x);
      case 'histC',    params = norm_histeqC_init(x);
      case 'eval',     params = sNorm(i).params;
      otherwise,
        error(['Unrecognized method: ' sNorm(i).method]);
      end
      sNorm(i).params = params;
      sNorm(i).status = 'undone';
    end

    % do / undo
    if strcmp(operation,'do'),
      switch(sNorm(i).method),
      case 'var',      x = norm_scale_do(x,sNorm(i).params);
      case 'range',    x = norm_scale_do(x,sNorm(i).params);
      case 'log',      x = norm_log_do(x,sNorm(i).params);
      case 'logistic', x = norm_logistic_do(x,sNorm(i).params);
      case 'histD',    x = norm_histeqD_do(x,sNorm(i).params);
      case 'histC',    x = norm_histeqC_do(x,sNorm(i).params);
      case 'eval',     x = norm_eval_do(x,sNorm(i).params);
      otherwise,
        error(['Unrecognized method: ' sNorm(i).method]);
      end
      sNorm(i).status = 'done';

    elseif strcmp(operation,'undo'),

      % an uninitialized struct has no parameters to undo with: stop here
      if strcmp(sNorm(i).status,'uninit'),
        warning('Could not undo: uninitialized normalization struct.')
        break;
      end
      switch(sNorm(i).method),
      case 'var',      x = norm_scale_undo(x,sNorm(i).params);
      case 'range',    x = norm_scale_undo(x,sNorm(i).params);
      case 'log',      x = norm_log_undo(x,sNorm(i).params);
      case 'logistic', x = norm_logistic_undo(x,sNorm(i).params);
      case 'histD',    x = norm_histeqD_undo(x,sNorm(i).params);
      case 'histC',    x = norm_histeqC_undo(x,sNorm(i).params);
      case 'eval',     x = norm_eval_undo(x,sNorm(i).params);
      otherwise,
        error(['Unrecognized method: ' sNorm(i).method]);
      end
      sNorm(i).status = 'undone';

    elseif ~strcmp(operation,'init'),

      error(['Unrecognized operation: ' operation])

    end
  end

  return;
  317. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  318. %% subfunctions
  319. % linear scaling
  function p = norm_variance_init(x)
  % NORM_VARIANCE_INIT Parameters for variance normalization.
  %   Returns p = [mean, std] computed over the finite (non-NaN, non-Inf)
  %   entries of x. A zero standard deviation is replaced by 1 so that the
  %   subsequent division is harmless.
    finiteVals = x(isfinite(x));
    mu    = mean(finiteVals);
    sigma = std(finiteVals);
    if sigma == 0
      sigma = 1;
    end
    p = [mu, sigma];
  %end of norm_variance_init
  function p = norm_scale01_init(x)
  % NORM_SCALE01_INIT Parameters for scaling values to the range [0,1].
  %   Returns p = [minimum, range] over the finite entries of x. When all
  %   finite values are equal the range is set to 1 instead of 0.
    finiteVals = x(isfinite(x));
    lo = min(finiteVals);
    hi = max(finiteVals);
    if lo == hi
      p = [lo, 1];
    else
      p = [lo, hi-lo];
    end
  %end of norm_scale01_init
  function x = norm_scale_do(x,p)
  % NORM_SCALE_DO Apply a linear scaling: subtract p(1), then divide by p(2).
    offset = p(1);
    scale  = p(2);
    x = (x - offset) / scale;
  % end of norm_scale_do
  function x = norm_scale_undo(x,p)
  % NORM_SCALE_UNDO Revert a linear scaling: multiply by p(2), then add p(1).
    scale  = p(2);
    offset = p(1);
    x = x * scale + offset;
  % end of norm_scale_undo
  337. % logarithm
  function p = norm_log_init(x)
  % NORM_LOG_INIT Parameter for logarithmic normalization.
  %   Returns the minimum of the finite (non-NaN, non-Inf) entries of x.
    finiteVals = x(isfinite(x));
    p = min(finiteVals);
  % end of norm_log_init
  function x = norm_log_do(x,p)
  % NORM_LOG_DO Apply logarithmic normalization x -> log(x - p + 1).
  %   p is the stored minimum; values below p-1 produce complex results.
    shifted = x - p + 1;
    x = log(shifted);
    % if any(~isreal(x)), ok = 0; end
  % end of norm_log_do
  function x = norm_log_undo(x,p)
  % NORM_LOG_UNDO Revert logarithmic normalization: x -> exp(x) - 1 + p.
    expanded = exp(x);
    x = expanded - 1 + p;
  % end of norm_log_undo
  349. % logistic
  function p = norm_logistic_init(x)
  % NORM_LOGISTIC_INIT Parameters for logistic (softmax) normalization.
  %   Returns p = [mean, std] of the finite entries of x; a zero standard
  %   deviation is replaced by 1.
    finiteVals = x(isfinite(x));
    mu    = mean(finiteVals);
    sigma = std(finiteVals);
    if sigma == 0
      sigma = 1;
    end
    p = [mu, sigma];
  % end of norm_logistic_init
  function x = norm_logistic_do(x,p)
  % NORM_LOGISTIC_DO Apply logistic normalization.
  %   The values are first standardized with p = [mean, std] and then
  %   passed through the logistic function 1/(1+exp(-z)).
    z = (x - p(1)) / p(2);
    x = 1 ./ (1 + exp(-z));
  % end of norm_logistic_do
  function x = norm_logistic_undo(x,p)
  % NORM_LOGISTIC_UNDO Revert logistic normalization.
  %   Applies the logit log(x/(1-x)) and then undoes the standardization
  %   with p = [mean, std].
    z = log(x ./ (1 - x));
    x = z * p(2) + p(1);
  % end of norm_logistic_undo
  363. % histogram equalization for discrete values
  function p = norm_histeqD_init(x)
  % NORM_HISTEQD_INIT Parameters for discrete histogram equalization.
  %   Returns the sorted unique finite (non-NaN, non-Inf) values of x.
    finiteVals = x(isfinite(x));
    p = unique(finiteVals);
  % end of norm_histeqD_init
  function x = norm_histeqD_do(x,p)
  % NORM_HISTEQD_DO Apply discrete histogram equalization.
  %   x   values to normalize; NaN and Inf entries are not re-indexed
  %   p   sorted unique values stored at initialization
  %
  %   Each finite value is replaced by the ordinal number of the nearest
  %   stored value (or of the next one, when the value lies above the
  %   nearest stored value), and the ordinals are scaled to [0,1].
    bins = length(p);
    % force a row vector of indices so that the for-loop visits one element
    % at a time for both row and column vector x
    inds = find(~isnan(x) & ~isinf(x));
    inds = inds(:)';
    for i = inds,
      [dummy ind] = min(abs(x(i) - p));
      % data item closer to the left-hand bin wall is indexed after RH wall
      if x(i) > p(ind) & ind < bins,
        x(i) = ind + 1;
      else
        x(i) = ind;
      end
    end
    if bins > 1,
      x = (x-1)/(bins-1); % normalization between [0,1]
    else
      % a single stored value: avoid 0/0 and map every finite entry to 0
      x(inds) = 0;
    end
  % end of norm_histeqD_do
  function x = norm_histeqD_undo(x,p)
  % NORM_HISTEQD_UNDO Revert discrete histogram equalization.
  %   The normalized values are scaled back to ordinal numbers, rounded,
  %   and each finite ordinal is replaced by the corresponding stored
  %   value in p. NaN and Inf entries are left as they are.
    nbins = length(p);
    ordinal = round(x*(nbins-1)+1);
    usable = isfinite(ordinal);
    ordinal(usable) = p(ordinal(usable));
    x = ordinal;
  % end of norm_histeqD_undo
  388. % histogram equalization with partially linear functions
  function p = norm_histeqC_init(x)
  % NORM_HISTEQC_INIT Parameters for continuous histogram equalization.
  %   Returns the bin limits p (first = minimum, last = maximum of the
  %   finite values of x). The target number of limits is
  %   ceil(sqrt(number of unique finite values)); interior limits are
  %   placed halfway between neighbouring unique values so that the bins
  %   hold roughly equal numbers of samples.
  %
  %   With strongly skewed data the filling loop can run out of unique
  %   values before every interior limit has been placed. In that case
  %   only the limits actually placed are returned (fewer bins), so that
  %   p is always non-decreasing.

    % investigate x
    inds = find(~isnan(x) & ~isinf(x));
    samples = length(inds);
    xs = unique(x(inds));
    mi = xs(1);
    ma = xs(end);
    % decide number of limits
    lims = ceil(sqrt(length(xs))); % 2->2,100->10,1000->32,10000->100
    % decide limits
    if lims==1,
      % a single unique value: use a unit-width bin starting at it
      p = [mi, mi+1];
      lims = 2;
    elseif lims==2,
      p = [mi, ma];
    else
      p = zeros(lims,1);
      p(1) = mi;
      p(end) = ma;
      binsize = zeros(lims-1,1); b = 1; avebinsize = samples/(lims-1);
      for i=1:(length(xs)-1),
        binsize(b) = binsize(b) + sum(x==xs(i));
        if binsize(b) >= avebinsize,
          % current bin is full: close it halfway to the next unique value
          b = b + 1;
          p(b) = (xs(i)+xs(i+1))/2;
        end
        if b==(lims-1),
          % last bin takes all remaining samples
          binsize(b) = samples-sum(binsize); break;
        else
          avebinsize = (samples-sum(binsize))/(lims-1-b);
        end
      end
      if b < (lims-1),
        % not every interior limit was assigned: drop the unassigned
        % (still zero) entries and keep only the limits placed so far
        p = [p(1:b); ma];
      end
    end
  % end of norm_histeqC_init
  function x = norm_histeqC_do(x,p)
  % NORM_HISTEQC_DO Apply continuous (piecewise linear) histogram equalization.
  %   x   values to normalize
  %   p   bin limits from initialization
  %
  %   Values inside bin k are mapped linearly to [k-1,k]; values outside
  %   the limits are extrapolated with the slope of the first/last bin.
  %   Finally everything is divided by the number of bins, so that p(1)
  %   maps to 0 and p(end) maps to 1. Non-finite values are passed through.
    nlims  = length(p);
    mapped = x;

    % at or below the minimum: extrapolate with the first bin's width
    width = p(2)-p(1);
    sel = (x<=p(1) & isfinite(x));
    if any(sel(:))
      mapped(sel) = 0-(p(1)-x(sel))/width;
    end

    % above the maximum: extrapolate with the last bin's width
    width = p(end)-p(end-1);
    sel = (x>p(end) & isfinite(x));
    if any(sel(:))
      mapped(sel) = nlims-1+(x(sel)-p(end))/width;
    end

    % inside the limits: linear map within each bin
    for k=1:(nlims-1)
      lo = p(k);
      hi = p(k+1);
      width = hi-lo;
      sel = (x>lo & x<=hi);
      if any(sel(:))
        mapped(sel) = k-1+(x(sel)-lo)/width;
      end
    end

    % scale so that minimum and maximum correspond to 0 and 1
    x = mapped/(nlims-1);
  % end of norm_histeqC_do
  function x = norm_histeqC_undo(x,p)
  % NORM_HISTEQC_UNDO Revert continuous histogram equalization.
  %   x   normalized values
  %   p   bin limits from initialization
  %
  %   The values are first multiplied by the number of bins and then
  %   mapped back linearly bin by bin; values at or below 0, or above the
  %   number of bins, are extrapolated with the width of the first/last
  %   bin. Non-finite values are passed through.
    restored = x;
    nlims = length(p);

    % scale so that 0 and 1 correspond to minimum and maximum
    scaled = x*(nlims-1);

    % at or below the minimum
    width = p(2)-p(1);
    sel = (scaled<=0 & isfinite(scaled));
    if any(sel(:))
      restored(sel) = scaled(sel)*width + p(1);
    end

    % above the maximum
    width = p(end)-p(end-1);
    sel = (scaled>nlims-1 & isfinite(scaled));
    if any(sel(:))
      restored(sel) = (scaled(sel)-(nlims-1))*width+p(end);
    end

    % inside the limits
    for k=1:(nlims-1)
      lo = p(k);
      hi = p(k+1);
      width = hi-lo;
      sel = (scaled>k-1 & scaled<=k);
      if any(sel(:))
        restored(sel) = (scaled(sel)-(k-1))*width + lo;
      end
    end

    x = restored;
  % end of norm_histeqC_undo
  464. % eval
  function p = norm_eval_init(method)
  % NORM_EVAL_INIT Parameters for the 'eval' normalization.
  %   The given expression strings are returned unchanged and serve as
  %   the normalization parameters.
    exprs = method;
    p = exprs;
  %end of norm_eval_init
  function x = norm_eval_do(x,p)
  % NORM_EVAL_DO Apply a freeform normalization expression.
  %   p{1} is evaluated with 'eval'; inside the expression the variable
  %   'x' refers to the values. The result is accepted only when both the
  %   result and the input are non-empty and have exactly one column,
  %   otherwise x is returned unchanged.
  %
  %   NOTE(review): 'eval' runs arbitrary code - the expression strings
  %   must come from a trusted source.
    candidate = eval(p{1});
    if size(candidate,1)>=1 & size(x,1)>=1
      if size(candidate,2)==1 & size(x,2)==1
        x = candidate;
      end
    end
  %end of norm_eval_do
  function x = norm_eval_undo(x,p)
  % NORM_EVAL_UNDO Apply a freeform denormalization expression.
  %   p{2} is evaluated with 'eval'; inside the expression the variable
  %   'x' refers to the values. The result is accepted only when both the
  %   result and the input are non-empty and have exactly one column,
  %   otherwise x is returned unchanged.
  %
  %   NOTE(review): 'eval' runs arbitrary code - the expression strings
  %   must come from a trusted source.
    candidate = eval(p{2});
    if size(candidate,1)>=1 & size(x,1)>=1
      if size(candidate,2)==1 & size(x,2)==1
        x = candidate;
      end
    end
  %end of norm_eval_undo
  482. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%