
/tools/human_genome_variation/lps.xml

https://bitbucket.org/cistrome/cistrome-harvard/
<tool id="hgv_lps" name="LPS" version="1.0.0">
  <description>LASSO-Patternsearch algorithm</description>
  <command interpreter="bash">
    lps_tool_wrapper.sh $lambda_fac $input_file $label_column $output_file $log_file
    Initialization 0
    #if $advanced.options == "true":
      Sample $advanced.sample
      Verbosity $advanced.verbosity
      Standardize $advanced.standardize
      initialLambda $advanced.initialLambda
      #if $advanced.continuation.continuation == "1":
        Continuation $advanced.continuation.continuation
        continuationSteps $advanced.continuation.continuationSteps
        accurateIntermediates $advanced.continuation.accurateIntermediates
      #end if
      printFreq $advanced.printFreq
      #if $advanced.newton.newton == "1":
        Newton $advanced.newton.newton
        NewtonThreshold $advanced.newton.newtonThreshold
      #end if
      HessianSampleFraction $advanced.hessianSampleFraction
      BB 0
      Monotone 0
      FullGradient $advanced.fullGradient
      GradientFraction $advanced.gradientFraction
      InitialAlpha $advanced.initialAlpha
      AlphaIncrease $advanced.alphaIncrease
      AlphaDecrease $advanced.alphaDecrease
      AlphaMax $advanced.alphaMax
      c1 $advanced.c1
      MaxIter $advanced.maxIter
      StopTol $advanced.stopTol
      IntermediateTol $advanced.intermediateTol
      FinalOnly $advanced.finalOnly
    #end if
  </command>
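  <!-- Illustrative only (not generated by Galaxy): with the advanced options hidden, the
       template above resolves to a command line of the form
         lps_tool_wrapper.sh 0.03 input.tabular 280 output.tabular log.txt Initialization 0
       where 0.03 is the default Lambda_fac and the file names and label column number are
       hypothetical example values; the advanced branch appends further key/value pairs. -->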
  <inputs>
    <param name="input_file" type="data" format="tabular" label="Dataset"/>
    <param name="label_column" type="data_column" data_ref="input_file" numerical="true" label="Label column" help="Column containing outcome labels: +1 or -1."/>
    <param name="lambda_fac" label="Lambda_fac" type="float" value="0.03" help="Target value of the regularization parameter, expressed as a fraction of the calculated lambda_max.">
      <validator type="in_range" message="0.00 &lt; lambda_fac &lt;= 1.00" min="0.00" max="1.00"/>
    </param>
    <conditional name="advanced">
      <param name="options" type="select" label="Advanced Options">
        <option value="false" selected="true">Hide advanced options</option>
        <option value="true">Show advanced options</option>
      </param>
      <when value="false">
        <!-- no options -->
      </when>
      <when value="true">
        <!-- HARDCODED: 'Sample' we don't support passing an array -->
        <param name="sample" type="float" value="1.0" label="Sample fraction" help="Sample this fraction of the data set.">
          <validator type="in_range" message="0.0 &lt;= sample &lt;= 1.0" min="0.0" max="1.0"/>
        </param>
        <!-- HARDCODED: 'Initialization' = 0 :: Initialize at beta=0 -->
        <param name="verbosity" type="select" format="integer" label="Verbosity">
          <option value="0" selected="true">Little output</option>
          <option value="1">More output</option>
          <option value="2">Still more output</option>
        </param>
        <param name="standardize" type="select" format="integer" label="Standardize" help="Scales and shifts each column so that it has mean zero and variance 1.">
          <option value="0" selected="true">Don't standardize</option>
          <option value="1">Standardize</option>
        </param>
        <param name="initialLambda" type="float" value="0.8" label="Initial lambda" help="First value of lambda to be used in the continuation scheme, expressed as a fraction of lambda_max.">
          <validator type="in_range" message="0.0 &lt; initialLambda &lt; 1.0" min="0.0" max="1.0"/>
        </param>
        <conditional name="continuation">
          <param name="continuation" type="select" format="integer" label="Continuation" help="Use continuation strategy to start with a larger value of lambda, decreasing it successively to lambda_fac.">
            <option value="0" selected="true">Don't use continuation</option>
            <option value="1">Use continuation</option>
          </param>
          <when value="0">
            <!-- no options -->
          </when>
          <when value="1">
            <param name="continuationSteps" type="integer" value="5" label="Continuation steps" help="Number of lambda values to use in continuation &lt;em&gt;prior&lt;/em&gt; to target value lambda_fac."/>
            <param name="accurateIntermediates" type="select" format="integer" label="Accurate intermediates" help="Indicates whether accurate solutions are required for lambda values other than the target value lambda_fac.">
              <option value="0" selected="true">Don't need accurate intermediates</option>
              <option value="1">Calculate accurate intermediates</option>
            </param>
          </when>
        </conditional> <!-- name="continuation" -->
        <param name="printFreq" type="integer" value="1" label="Print frequency" help="Print a progress report every NI iterations, where NI is the supplied value of this parameter.">
          <validator type="in_range" message="printFreq &gt;= 1" min="1"/>
        </param>
        <conditional name="newton">
          <param name="newton" type="select" format="integer" label="Projected Newton steps">
            <option value="0" selected="true">No Newton steps</option>
            <option value="1">Try projected Newton steps</option>
          </param>
          <when value="0">
            <!-- no options -->
          </when>
          <when value="1">
            <param name="newtonThreshold" type="integer" value="500" label="Newton threshold" help="Maximum size of free variable subvector for Newton."/>
          </when>
        </conditional>
        <param name="hessianSampleFraction" type="float" value="1.0" label="Hessian sample fraction" help="Fraction of terms to use in approximate Hessian calculation.">
          <validator type="in_range" message="0.01 &lt; hessianSampleFraction &lt;= 1.00" min="0.01" max="1.00"/>
        </param>
        <!-- HARDCODED: 'BB' = 0 :: don't use Barzilai-Borwein steps -->
        <!-- HARDCODED: 'Monotone' = 0 :: don't force monotonicity -->
        <param name="fullGradient" type="select" format="integer" label="Partial gradient vector selection">
          <option value="0">Use randomly selected partial gradient, including current active components ("biased")</option>
          <option value="1">Use full gradient vector at every step</option>
          <option value="2">Randomly selected partial gradient, without regard to current active set ("unbiased")</option>
        </param>
        <param name="gradientFraction" type="float" value="0.1" label="Gradient fraction" help="Fraction of inactive gradient vector to evaluate.">
          <validator type="in_range" message="0.0 &lt; gradientFraction &lt;= 1" min="0.0" max="1.0"/>
        </param>
        <param name="initialAlpha" type="float" value="1.0" label="Initial value of alpha"/>
        <param name="alphaIncrease" type="float" value="2.0" label="Alpha increase" help="Factor by which to increase alpha after descent not obtained."/>
        <param name="alphaDecrease" type="float" value="0.8" label="Alpha decrease" help="Factor by which to decrease alpha after successful first-order step."/>
        <param name="alphaMax" type="float" value="1e12" label="Alpha max" help="Maximum value of alpha; terminate with error if we exceed this."/>
        <param name="c1" type="float" value="1e-3" help="Parameter defining the margin by which the first-order step is required to decrease before being taken.">
          <validator type="in_range" message="0.0 &lt; c1 &lt; 1.0" min="0.0" max="1.0"/>
        </param>
        <param name="maxIter" type="integer" value="10000" label="Maximum number of iterations" help="Terminate with error if we exceed this."/>
        <param name="stopTol" type="float" value="1e-6" label="Stop tolerance" help="Convergence tolerance for target value of lambda."/>
        <param name="intermediateTol" type="float" value="1e-4" label="Intermediate tolerance" help="Convergence tolerance for intermediate values of lambda."/>
        <param name="finalOnly" type="select" format="integer" label="Final only">
          <option value="0" selected="true">Return information for all intermediate values</option>
          <option value="1">Just return information at the last lambda</option>
        </param>
      </when> <!-- value="true" -->
    </conditional> <!-- name="advanced" -->
  </inputs>
  <outputs>
    <data name="output_file" format="tabular" label="${tool.name} on ${on_string}: results"/>
    <data name="log_file" format="txt" label="${tool.name} on ${on_string}: log"/>
  </outputs>
  <requirements>
    <requirement type="package">lps_tool</requirement>
  </requirements>
  <tests>
    <test>
      <param name="input_file" value="lps_arrhythmia.tabular"/>
      <param name="label_column" value="280"/>
      <param name="lambda_fac" value="0.03"/>
      <param name="options" value="true"/>
      <param name="sample" value="1.0"/>
      <param name="verbosity" value="1"/>
      <param name="standardize" value="0"/>
      <param name="initialLambda" value="0.9"/>
      <param name="continuation" value="1"/>
      <param name="continuationSteps" value="10"/>
      <param name="accurateIntermediates" value="0"/>
      <param name="printFreq" value="1"/>
      <param name="newton" value="1"/>
      <param name="newtonThreshold" value="500"/>
      <param name="hessianSampleFraction" value="1.0"/>
      <param name="fullGradient" value="1"/>
      <param name="gradientFraction" value="0.5"/>
      <param name="initialAlpha" value="1.0"/>
      <param name="alphaIncrease" value="2.0"/>
      <param name="alphaDecrease" value="0.8"/>
      <param name="alphaMax" value="1e12"/>
      <param name="c1" value="1e-3"/>
      <param name="maxIter" value="2500"/>
      <param name="stopTol" value="1e-6"/>
      <param name="intermediateTol" value="1e-6"/>
      <param name="finalOnly" value="0"/>
      <output name="output_file" file="lps_arrhythmia_beta.tabular"/>
      <output name="log_file" file="lps_arrhythmia_log.txt"/>
    </test>
  </tests>
  <help>
**Dataset formats**

The input and output datasets are tabular_. The columns are described below.
There is a second output dataset (a log) that is in text_ format.
(`Dataset missing?`_)

.. _tabular: ./static/formatHelp.html#tab
.. _text: ./static/formatHelp.html#text
.. _Dataset missing?: ./static/formatHelp.html

-----

**What it does**

The LASSO-Patternsearch algorithm fits your dataset to an L1-regularized
logistic regression model. A benefit of using L1-regularization is
that it typically yields a weight vector with relatively few non-zero
coefficients.

For example, say you have a dataset containing M rows (subjects)
and N columns (attributes) where one of these N attributes is binary,
indicating whether or not the subject has some property of interest P.
In simple terms, LPS calculates a weight for each of the other attributes
in your dataset. This weight indicates how "relevant" that attribute
is for predicting whether or not a given subject has property P.
The L1-regularization causes most of these weights to be equal to zero,
which means LPS will find a "small" subset of the remaining N-1 attributes
in your dataset that can be used to predict P.
In other words, LPS can be used for feature selection.

The input dataset is tabular, and must contain a label column which
indicates whether or not a given row has property P. In the current
version of this tool, P must be encoded using +1 and -1. The Lambda_fac
parameter ranges from 0 to 1, and controls how sparse the weight
vector will be. At the low end, when Lambda_fac = 0, there will be
no regularization. At the high end, when Lambda_fac = 1, there will be
"too much" regularization, and all of the weights will equal zero.
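
For example, if the computed value of lambda_max were 0.5, then the
default Lambda_fac of 0.03 would correspond to a target lambda of
about 0.03 * 0.5 = 0.015.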

The LPS tool creates two output datasets. The first, called the results
file, is a tabular dataset containing one column of weights for each
value of the regularization parameter lambda that was tried. The weight
columns are in order from left to right by decreasing values of lambda.
The first N-1 rows in each column are the weights for the N-1 attributes
in your input dataset. The final row is a constant, the intercept.

Let **x** be a row from your input dataset and let **b** be a column
from the results file. To compute the probability that row **x** has
a label value of +1:

Probability(row **x** has label value = +1) = 1 / [1 + exp{**x** \* **b**\[1..N-1\] + **b**\[N\]}]

where **x** \* **b**\[1..N-1\] represents matrix multiplication.
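
The following is purely an illustrative sketch, not part of this tool: given one
weight column loaded into a Python list ``b`` (the N-1 attribute weights followed
by the intercept) and a row's N-1 attribute values in a list ``x``, the formula
above could be evaluated as::

    import math

    def prob_label_plus_one(x, b):
        # x: N-1 attribute values of one row (label column removed)
        # b: one weight column; b[:-1] are attribute weights, b[-1] is the intercept
        score = sum(xi * bi for xi, bi in zip(x, b[:-1])) + b[-1]
        return 1.0 / (1.0 + math.exp(score))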

The second output dataset, called the log file, is a text file which
contains additional data about the fitted L1-regularized logistic
regression model. These data include the number of features, the
computed value of lambda_max, the actual values of lambda used, the
optimal values of the log-likelihood and regularized log-likelihood
functions, the number of non-zeros, and the number of iterations.

Website: http://pages.cs.wisc.edu/~swright/LPS/

-----

**Example**

- input file::

    +1 1 0 0 0 0 1 0 1 1 ...
    +1 1 1 1 0 0 1 0 1 1 ...
    +1 1 0 1 0 1 0 1 0 1 ...
    etc.

- output results file::

    0
    0
    0
    0
    0.025541
    etc.

- output log file::

    Data set has 100 vectors with 50 features.
    calculateLambdaMax: n=50, m=100, m+=50, m-=50
    computed value of lambda_max: 5.0000e-01
    lambda=2.96e-02 solution:
    optimal log-likelihood function value: 6.46e-01
    optimal *regularized* log-likelihood function value: 6.79e-01
    number of nonzeros at the optimum: 5
    number of iterations required: 43
    etc.

-----

**References**

Koh K, Kim S-J, Boyd S. (2007)
An interior-point method for large-scale l1-regularized logistic regression.
Journal of Machine Learning Research. 8:1519-1555.

Shi W, Wahba G, Wright S, Lee K, Klein R, Klein B. (2008)
LASSO-Patternsearch algorithm with application to ophthalmology and genomic data.
Stat Interface. 1(1):137-153.

<!--
Wright S, Nowak R, Figueiredo M. (2009)
Sparse reconstruction by separable approximation.
IEEE Transactions on Signal Processing. 57:2479-2493.

Shi J, Yin W, Osher S, Sajda P. (2010)
A fast hybrid algorithm for large scale l1-regularized logistic regression.
Journal of Machine Learning Research. 11:713-741.

Byrd R, Chin G, Neveitt W, Nocedal J. (2010)
On the use of stochastic Hessian information in unconstrained optimization.
Technical Report. Northwestern University. June 16, 2010.

Wright S. (2010)
Accelerated block-coordinate relaxation for regularized optimization.
Technical Report. University of Wisconsin. August 10, 2010.
-->
  </help>
</tool>