PageRenderTime 174ms CodeModel.GetById 8ms RepoModel.GetById 0ms app.codeStats 0ms

/src/shogun/classifier/vw/VwParser.h

https://code.google.com/
C Header | 270 lines | 91 code | 27 blank | 152 comment | 8 complexity | 9c630396b6f96de614c7c7b9dfac9589 MD5 | raw file
Possible License(s): GPL-2.0, GPL-3.0, BSD-3-Clause
  1. /*
  2. * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights
  3. * embodied in the content of this file are licensed under the BSD
  4. * (revised) open source license.
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * Written (W) 2011 Shashwat Lal Das
  12. * Adaptation of Vowpal Wabbit v5.1.
  13. * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
  14. */
  15. #ifndef _VW_PARSER_H__
  16. #define _VW_PARSER_H__
  17. #include <shogun/base/SGObject.h>
  18. #include <shogun/io/SGIO.h>
  19. #include <shogun/lib/Hash.h>
  20. #include <shogun/classifier/vw/vw_common.h>
  21. #include <shogun/classifier/vw/cache/VwCacheWriter.h>
  22. namespace shogun
  23. {
  /**
   * Kind of input the parser is asked to read.
   *
   * The numeric values are fixed: the first enumerator is 1 and each
   * following enumerator is one greater than its predecessor.
   */
  enum E_VW_PARSER_TYPE
  {
      /// VW input (presumably the format read by CVwParser::read_features - confirm)
      T_VW = 1,
      /// SVMLight input, value 2
      T_SVMLIGHT,
      /// Dense-vector input, value 3
      T_DENSE
  };
  31. /** @brief CVwParser is the object which provides the
  32. * functions to parse examples from buffered input.
  33. *
  34. * An instance of this class can be created in
  35. * CStreamingVwFile and the appropriate read_*_features
  36. * function called to parse examples from different formats.
  37. *
  38. * It also encapsulates a CVwCacheWriter object which may
  39. * be used in case a cache file is to be generated simultaneously
  40. * with parsing.
  41. */
  42. class CVwParser: public CSGObject
  43. {
  44. public:
  45. /**
  46. * Default constructor
  47. */
  48. CVwParser();
  49. /**
  50. * Constructor taking environment as parameter.
  51. *
  52. * @param env_to_use CVwEnvironment to use
  53. */
  54. CVwParser(CVwEnvironment* env_to_use);
  55. /**
  56. * Destructor
  57. */
  58. virtual ~CVwParser();
  59. /**
  60. * Get the environment
  61. *
  62. * @return environment as CVwEnvironment*
  63. */
  64. CVwEnvironment* get_env()
  65. {
  66. SG_REF(env);
  67. return env;
  68. }
  69. /**
  70. * Set the environment
  71. *
  72. * @param env_to_use environment as CVwEnvironment*
  73. */
  74. void set_env(CVwEnvironment* env_to_use)
  75. {
  76. env = env_to_use;
  77. SG_REF(env);
  78. }
  79. /**
  80. * Set the cache parameters
  81. *
  82. * @param fname name of the cache file
  83. * @param type type of cache as one in EVwCacheType
  84. */
  85. void set_cache_parameters(char * fname, EVwCacheType type = C_NATIVE)
  86. {
  87. init_cache(fname, type);
  88. }
  89. /**
  90. * Return the type of cache
  91. *
  92. * @return cache type as EVwCacheType
  93. */
  94. EVwCacheType get_cache_type()
  95. {
  96. return cache_type;
  97. }
  98. /**
  99. * Set whether to write cache file or not
  100. *
  101. * @param wr_cache write cache or not
  102. */
  103. void set_write_cache(bool wr_cache)
  104. {
  105. write_cache = wr_cache;
  106. if (wr_cache)
  107. init_cache(NULL);
  108. else
  109. if (cache_writer)
  110. SG_UNREF(cache_writer);
  111. }
  112. /**
  113. * Return whether cache will be written or not
  114. *
  115. * @return will cache be written?
  116. */
  117. bool get_write_cache()
  118. {
  119. return write_cache;
  120. }
  121. /**
  122. * Update min and max labels seen in the environment
  123. *
  124. * @param label current label based on which to update
  125. */
  126. void set_mm(float64_t label)
  127. {
  128. env->min_label = CMath::min(env->min_label, label);
  129. if (label != FLT_MAX)
  130. env->max_label = CMath::max(env->max_label, label);
  131. }
  132. /**
  133. * A dummy function performing no operation in case training
  134. * is not to be performed.
  135. *
  136. * @param label label
  137. */
  138. void noop_mm(float64_t label) { }
  139. /**
  140. * Function which is actually called to update min and max labels
  141. * Should be set to one of the functions implemented for this.
  142. *
  143. * @param label label based on which to update
  144. */
  145. void set_minmax(float64_t label)
  146. {
  147. set_mm(label);
  148. }
  149. /**
  150. * Reads input from the buffer and parses it into a VwExample
  151. *
  152. * @param buf IOBuffer which contains input
  153. * @param ex parsed example
  154. *
  155. * @return number of characters read for this example
  156. */
  157. int32_t read_features(CIOBuffer* buf, VwExample*& ex);
  158. /**
  159. * Read an example from an SVMLight file
  160. *
  161. * @param buf IOBuffer which contains input
  162. * @param ae parsed example
  163. *
  164. * @return number of characters read for this example
  165. */
  166. int32_t read_svmlight_features(CIOBuffer* buf, VwExample*& ae);
  167. /**
  168. * Read an example from a file with dense vectors
  169. *
  170. * @param buf IOBuffer which contains input
  171. * @param ae parsed example
  172. *
  173. * @return number of characters read for this example
  174. */
  175. int32_t read_dense_features(CIOBuffer* buf, VwExample*& ae);
  176. /**
  177. * Return the name of the object
  178. *
  179. * @return VwParser
  180. */
  181. virtual const char* get_name() const { return "VwParser"; }
  182. protected:
  183. /**
  184. * Initialize the cache writer
  185. *
  186. * @param fname cache file name
  187. * @param type cache type as EVwCacheType, default is C_NATIVE
  188. */
  189. void init_cache(char * fname, EVwCacheType type = C_NATIVE);
  190. /**
  191. * Get value of feature from a given substring.
  192. * A default of 1 is assumed if no explicit value is specified.
  193. *
  194. * @param s substring, usually a feature:value string
  195. * @param name returned array of substrings, split into name and value
  196. * @param v value of feature, set by reference
  197. */
  198. void feature_value(substring &s, v_array<substring>& name, float32_t &v);
  199. /**
  200. * Split a given substring into an array of substrings
  201. * based on a specified delimiter
  202. *
  203. * @param delim delimiter to use
  204. * @param s substring to tokenize
  205. * @param ret array of substrings, returned
  206. */
  207. void tokenize(char delim, substring s, v_array<substring> &ret);
  208. /**
  209. * Get the index of a character in a memory location
  210. * taking care not to go beyond the max pointer.
  211. *
  212. * @param start start memory location, char*
  213. * @param v character to search for
  214. * @param max last location to look in
  215. *
  216. * @return index of found location as char*
  217. */
  218. inline char* safe_index(char *start, char v, char *max)
  219. {
  220. while (start != max && *start != v)
  221. start++;
  222. return start;
  223. }
  224. public:
  225. /// Hash function to use, of type hash_func_t
  226. hash_func_t hasher;
  227. protected:
  228. /// Environment of VW - used by parser
  229. CVwEnvironment* env;
  230. /// Object which will be used for writing cache
  231. CVwCacheWriter* cache_writer;
  232. /// Type of cache
  233. EVwCacheType cache_type;
  234. /// Whether to write cache or not
  235. bool write_cache;
  236. private:
  237. /// Used during parsing
  238. v_array<substring> channels;
  239. v_array<substring> words;
  240. v_array<substring> name;
  241. };
  242. }
  243. #endif // _VW_PARSER_H__