PageRenderTime 66ms CodeModel.GetById 53ms app.highlight 7ms RepoModel.GetById 2ms app.codeStats 0ms

/CatRec.h

https://github.com/gigablast/open-source-search-engine
C Header | 527 lines | 44 code | 55 blank | 428 comment | 0 complexity | a25b7947fffbe67a17968015e8abefa0 MD5 | raw file
Possible License(s): Apache-2.0
  1// Matt Wells, copyright Jul 201
  2
  3// . the record retrieved from tagdb
  4// . used for describing a site
  5// . can parse out record from our rdb or from a network msg
  6// . has siteUrl and filenum of the file that holds the Xml that has the
  7//   parsing rules and quotas for docs in that site
  8// . we have the fields you can use at the bottom of this file
  9
 10#ifndef _CATREC_H_
 11#define _CATREC_H_
 12
 13#include "Conf.h"
 14#include "Xml.h"
 15#include "RdbList.h"
 16#include "Tagdb.h"
 17#include "Categories.h"
 18#include "Lang.h"
 19#include "Tagdb.h"
 20#include "Catdb.h"
 21
 22#define MAX_IND_CATIDS 1024
 23#define MAX_SITE_TYPES 12
 24// url, catids, indirect catids, numCatids, numIndCatids, filenum
 25#define CATREC_BUF_SIZE MAX_URL_LEN + MAX_CATIDS*4 + 9
 26
 27class CatRec {
 28
 29 public:
 30
 31	// these just set m_xml to NULL
 32	void reset() ;
 33	CatRec();
 34	~CatRec();
 35
 36	// . extract the site url for "url"
 37	// . extract the filenum of the file that holds the xml we want
 38	// . returns false and sets errno on error setting
 39	// . if rec is NULL we use the default rec for this collection
 40	bool set ( Url *url, char *data,int32_t dataSize,
 41		   bool gotByIp ); // , char rdbId = RDB_TAGDB );
 42
 43	// we're empty if m_xml is NULL
 44	//bool isEmpty() { return (! m_xml); };
 45
 46	// . used to by Msg9 to make a CatRec to add
 47	// . serializes filenum/site into our m_data/m_dataSize
 48	// . returns false and sets errno on error
 49	/*
 50	bool set ( Url *site , char *coll , int32_t collLen , int32_t filenum ,
 51		   char  version , char rdbId = RDB_TAGDB , int32_t timeStamp = 0,
 52		   char *comment = NULL, char *username = NULL,
 53		   int32_t *catids = NULL, unsigned char numCatids = 0, 
 54		   unsigned char spamBits = 0, char siteQuality = 0, 
 55		   char adultLevel = 0, 
 56		   SiteType *siteTypes = NULL, 
 57		   uint8_t numTypes = 0,
 58 		   SiteType *langs = NULL, 
 59 		   uint8_t numLangs = 0); 
 60	*/
 61	bool set ( Url *site , int32_t filenum ,
 62		   int32_t *catids = NULL, unsigned char numCatids = 0 );
 63
 64	//Xml *getXml() { return m_xml; };
 65
 66	//bool set ( int32_t filenum ) ;
 67
 68	//  . this method just sets the filenum, version, url and url-len from
 69	//  data-pointer "data"
 70	//  . this method is written as an alternative to the above set methods
 71	//  Useful if the caller is interested just in the url and url len
 72	//  saves time
 73	bool set (char *data, int32_t dataSize);//, char rdbId );
 74
 75	// set the indirect catids
 76	void setIndirectCatids ( int32_t *indCatids, int32_t numIndCatids );
 77
 78	// . did this url have an entry in tagdb?
 79	// . we need this to know because if it didn't it will have default rec
 80	// . Msg16 will override Url::isSpam() if this record is not default
 81	// . Msg25 will also not bother checking for link bans via Msg18
 82	bool hadRec() { return m_hadRec; };
 83
 84	// . did we get it by ip? (if not, we got it by canonical domain name)
 85	// . if we got it by IP and it was banned, admin has the option to
 86	//   tell gigablast to automatically add the domain name as banned
 87	//   to tagdb in Msg14.cpp
 88	bool gotByIp() { return m_gotByIp; };
 89
 90	// get the record itself (just templateNum/site/coll)
 91	char *getData     ( ) { return m_data; };
 92	int32_t  getDataSize ( ) { return m_dataSize; };
 93
 94	// along with coll/collLen identifies a unique xml file
 95	//int32_t  getFilenum ( ) { return m_filenum; };
 96	//int32_t  getRuleset ( ) { return m_filenum; };
 97	
 98
 99	// . these should both be NULL terminated
100	// . they both reference into the data contained in m_list
101	//   or m_buf if the list doesn't have a site record for us
102	Url  *getSite          ( ) { return &m_site; };
103	//char *getCollection    ( ) { return  m_coll; };
104	//int32_t  getCollectionLen ( ) { return  m_collLen; };
105
106	/*
107	char* printFormattedRec(char* p);
108	void  printFormattedRec(SafeBuf *sb);
109	char* printXmlRec      (char* p);
110        void  printXmlRec      ( SafeBuf *sb );
111
112	//status of manually set bits.
113	bool isSpamUnknown() { return m_spamBits == SPAM_UNKNOWN; }
114	bool isSpam()        { return m_spamBits == SPAM_BIT;     }
115	bool isNotSpam()     { return m_spamBits == NOT_SPAM;     }
116	char* getSpamStr();
117	unsigned char getSpamStatus() { return m_spamBits; }
118
119	//
120	bool isRatingUnknown()      { return m_adultLevel == NOT_RATED; }
121	bool isAdultButNotPorn()    { return m_adultLevel == RATED_R;   }
122	bool isPorn()               { return m_adultLevel == RATED_X;   }
123	bool isKidSafe()            { return m_adultLevel == RATED_G;   }
124	char* getAdultStr();
125
126	char *getPubDateFmtStr();
127
128	int32_t          getTimeStamp()   { return m_timeStamp; }
129	char         *getComment()     { return m_comment; }
130	char         *getUsername()    { return m_username; }
131	char          getSiteQuality() { return m_siteQuality; }
132	int32_t          getNumSiteTypes  () { return m_numTypes; }
133	int32_t          getNumSiteLangs  () { return m_numLangs; }
134	SiteType     *getSiteTypes  () { return m_siteTypes; }
135	SiteType     *getSiteLangs  () { return m_siteLangs; }
136	uint32_t      getScoreForType(uint8_t type);
137
138	// . mod functions
139	// . pain in the butt cuz we gotta change m_data/m_dataSize buffer too
140	void          addSiteType    (uint8_t type, uint32_t score ) ;
141	void          setFilenum     (int32_t newFilenum );
142
143	// . [n0,n1] constitute an xml node range in "xml"
144	// . "len" is the length of another node's data in another xml doc
145	// . gets the scoreWeight from docQuality and a node's dataLen
146	// . 2nd one gets the maxScore from docQuality
147	int32_t getScoreWeightFromQuality ( int32_t n0, int32_t n1, int32_t quality );
148	int32_t getScoreWeightFromQuality2( int32_t quality );
149	int32_t getMaxScoreFromQuality    ( int32_t n0, int32_t n1, int32_t quality );
150	int32_t getMaxLenFromQuality      ( int32_t n0, int32_t n1, int32_t quality );
151
152	//bool hasMaxCountFromQualityTag ( int32_t n0, int32_t n1 ) ;
153	//int32_t getMaxCountFromQuality    ( int32_t n0, int32_t n1, int32_t quality ) ;
154
155	int32_t getScoreWeightFromLen     ( int32_t n0, int32_t n1, int32_t len );
156	int32_t getScoreWeightFromLen2    ( int32_t len );
157	int32_t getScoreWeightFromNumWords( int32_t n0, int32_t n1, int32_t len );
158	int32_t getMaxScoreFromLen        ( int32_t n0, int32_t n1, int32_t quality );
159	int32_t getMaxScoreFromNumWords   ( int32_t n0, int32_t n1, int32_t quality );
160
161	// 2 new maps for boosting base quality from link statistics
162	int32_t getQualityBoostFromNumLinks       ( int32_t numLinks );
163	int32_t getQualityBoostFromLinkQualitySum ( int32_t linkBaseQualitySum );
164
165	// 2 new maps for maxScore/scoreWeight of outgoing linkText
166	int32_t getLinkTextScoreWeightFromLinkerQuality ( int32_t quality );
167	int32_t getLinkTextScoreWeightFromLinkeeQuality ( int32_t quality );
168	int32_t getLinkTextMaxScoreFromQuality    ( int32_t quality );
169	int32_t getLinkTextScoreWeightFromNumWords( int32_t numWords );
170
171
172	// . another new map for boosting quality from the link-adjusted 
173	//   quality of our root page
174	// . root page is just our site url (i.e. http://about.com/)
175	// . "rootQuality" is link-adjusted
176	int32_t getQualityBoostFromRootQuality ( int32_t rootQuality ) ;
177
178	int32_t getQuotaBoostFromRootQuality ( int32_t rootQuality ) ;
179	int32_t getQuotaBoostFromQuality     ( int32_t quality     ) ;
180
181	// if X% of the words are spammed, consider ALL the words to be spammed
182	int32_t getMaxPercentForSpamFromQuality ( int32_t quality ) ;
183
184//private:
185
186	// . parses and accesses a map/graph in the xml for us
187	// . returns default "def" if map not present or x's in map unordered
188	int32_t getY (int32_t n0,int32_t n1,int32_t X,char *strx,char *stry,int32_t def) ;
189	*/
190
191	// these reference into m_data???
192	Url     m_site;
193	//char    m_coll[64];
194	//int32_t    m_collLen;
195
196	// filenum determines the xml uniquely
197	int32_t    m_filenum;
198
199	// did this rec have it's own entry in tagdb?
200	bool    m_hadRec;
201	// did we get it by ip? (if not, we got it by canonical domain name)
202	bool    m_gotByIp;
203
204	/*
205	// . the xml describing this site
206	// . references into an Xml stored in Sitedb class
207	Xml    *m_xml;
208	*/
209
210	// a buffer for holding the little site record itself
211	char    m_data[CATREC_BUF_SIZE];
212	int32_t    m_dataSize;
213
214	// category ID info for catdb
215	unsigned char  m_numCatids;
216	int32_t          *m_catids;
217	int32_t           m_numIndCatids;
218	int32_t           m_indCatids[MAX_IND_CATIDS];
219
220	// version
221	unsigned char m_version;
222	/*
223
224	
225	unsigned char m_spamBits;
226	unsigned char m_adultLevel;
227	char          m_siteQuality;
228	
229	uint8_t m_numTypes;
230	uint8_t m_numLangs;
231	SiteType m_siteTypes[MAX_SITE_TYPES];
232	SiteType m_siteLangs[MAX_SITE_TYPES];
233	*/
234
235	// url pointer
236	char   *m_url;
237	int32_t    m_urlLen;
238
239	/*
240	// time stamp, comment, username
241	int32_t    m_timeStamp;
242	char   *m_comment;
243	char   *m_username;
244
245	// hack for addSiteType()
246	int32_t   *m_incHere;
247	char   *m_addHere ;
248	// hack for changeFilenum()
249	char   *m_filenumPtr;
250	*/
251};
252
253#endif
254
255// format of a template or default record in xml:
256
257// ## NOTE: the key of the record is the sitename prefixed with the collection:
258// ## NOTE: "collectionName:" is prefixed to all hashed terms before hashing
259// ## LATER: do permission system
260
 261// ## all indexed terms will be preceded by "collection:" when indexed so you
262// ## can do a search within that collection.
263// <comment>                  %s  </> 
264// ## <addedDate>                %s  </> (stored as a int32_t)
265// <allowMimeType>            %s  </> (text, html?) 
266// <allowExtension>           %s  </> (used iff allowAllExtensions is false)
267
268// ## the base quality of all docs from this site
269// <baseQuality>              %c  </> (0-100%,default 30,qual of docs in site)
270
271// ## the computed link-adjusted quality should not exceed this
272// <maxQuality>               %c  </> (0-100%, def 100)
273
274// ## should we treat incoming link text as if it were on our page?
275// ## score weights and maxes for the link text is determined by the linker's
276// ## own link-adjusted quality. (see graphs/maps below)
277// <indexIncomingLinkText>    %b  </> (0-100, default = 100, a %)
278
279// ## do links from this site always point to clean pages?
280// <linksClean>               %b  </> (default no)
281
282// ## a doc w/ link-adjusted quality LESS THAN this will not be indexed
283// <minQualityToIndex>        %c  </> (default 0%  )  
284
285// ## a doc w/ link-adjusted quality at or below this will be checked for
286// ## adult content.
287// <maxQualityForAdultDetect> %c  </> (default 0%, 0 means none)
288
289// ## how often do we re-spider it?
290// ## we try to compute the best spider rate based on last modified times
291// <minSpiderFrequency>       %i  </> (default 60*60*24*30=1month, in seconds)
292// <maxSpiderFrequency>       %i  </> (default 60*60*24*30=1month, in seconds)
293// <spiderLinks>              %b  </> (default true)
 294// <spiderLinkPriority>       %i  </> (0-7, default -1) -1 means parentPriority-1
 295// <spiderMaxPriority>        %i  </> (0-7, default 7) 
296
297
298// ## these are fairly self-explanatory
299// <maxUrlLen>                %i  </> (default 0, 0 means none)
300// <minMetaRefresh>           %i  </> (default 6  )
301// <isBanned>                 %b  </> (default no ) 
302// <isAdult>                  %b  </> (default no ) 
303// <isISP>                    %b  </> (default no ) 
304// <isTrusted>                %b  </> (default no ) 
305// <allowAdultContent>        %b  </> (default yes)
306// <allowCgiUrls>             %b  </> (default yes)
307// <allowIpUrls>              %b  </> (default yes)
308// <allowAllExtensions>       %b  </> (default yes)
309// <allowNonAsciiDocs>        %b  </> (default yes)
310// <delete404s>               %b  </> (default yes) from cache/titledb
311// <indexDupContent>          %b  </> (default yes)
312// <indexSite>                %b  </> (default yes) site:    terms 
313// <indexSubSite>             %b  </> (default yes) subsite: terms 
314// <indexUrl>                 %b  </> (default yes) url:     terms
315// <indexSubUrl>              %b  </> (default yes) suburl:  terms
316// <indexIp>                  %b  </> (default yes) ip:      terms
317// <indexLinks>               %b  </> (default yes) link:/href: terms
318
319// <maxDocs>                  %ul </> (default -1 = no max)
320
321// ## we don't have a security system... yet...
322// ## TODO: <maxCacheSpace>        %ul </> (default 1024*1024)
323// ## TODO: <directorMaxScore>     %s  </> (256bit seal for maxScore tag above)
324
325// ## Now for some maps/graphs.
326// ## we list the 5 X components followed by the 5 Y components.
327// ## all maps/graphs linearly interpolate between the points.
328// ## the edge pieces are horizontal.
329// ## these maps can have up to 32 points but i typically just use 5.
330
331// ## we map the NUMBER of incoming links to a baseQuality BOOST for our doc.
332// ## the resulting new quality is the link-adjusted quality of the linkee doc.
333// ## These boosts are ADDED to the existing quality.
334// <numLinks11>                %i  </> (default 0   ) 
335// <numLinks12>                %i  </> (default 5   )
336// <numLinks13>                %i  </> (default 10  )
337// <numLinks14>                %i  </> (default 20  )
338// <numLinks15>                %i  </> (default 50  )
339// <qualityBoost11>            %i  </> (default  0% )
340// <qualityBoost12>            %i  </> (default  5% )
341// <qualityBoost13>            %i  </> (default 10% )
342// <qualityBoost14>            %i  </> (default 15% )
343// <qualityBoost15>            %i  </> (default 20% )
344
345// ## we map the SUM of the baseQuality of all linkers to a baseQuality BOOST.
346// ## the resulting new quality is the link-adjusted quality of the linkee doc.
347// ## we only add up BASE quality of the linkers.
348// ## we only add up 1 linker's BASE quality per site.
349// ## These boosts are ADDED to the existing quality.
350// <linkQualitySum21>          %i  </> (default 0   )
351// <linkQualitySum22>          %i  </> (default 50  )
352// <linkQualitySum23>          %i  </> (default 100 )
353// <linkQualitySum24>          %i  </> (default 150 )
354// <linkQualitySum25>          %i  </> (default 200 )
355// <qualityBoost21>            %i  </> (default  0% )
356// <qualityBoost22>            %i  </> (default  5% )
357// <qualityBoost23>            %i  </> (default 10% )
358// <qualityBoost24>            %i  </> (default 15% )
359// <qualityBoost25>            %i  </> (default 20% )
360
361// ## we map the LINK-ADJUSTED QUALITY of our root page (site url) to a
362// ## quality BOOST for us.
363// ## the site url is just our site, could be like http://about.com/
364// ## These boosts are ADDED to the existing quality.
365// <rootQuality31>             %i  </> (default 0   ) 
366// <rootQuality32>             %i  </> (default 50  )
367// <rootQuality33>             %i  </> (default 100 )
368// <rootQuality34>             %i  </> (default 200 )
369// <rootQuality35>             %i  </> (default 500 )
370// <qualityBoost31>            %i  </> (default  0% )
371// <qualityBoost32>            %i  </> (default  5% )
372// <qualityBoost33>            %i  </> (default 10% )
373// <qualityBoost34>            %i  </> (default 15% )
374// <qualityBoost35>            %i  </> (default 20% )
375
376// ## TODO: make based on quality of doc and length of link text!!
377// ## currently we limit link text to up to 256 chars in LinkInfo.cpp.
 378// ## map doc's link-adjusted quality to scoreWeight of its outgoing link text
379// <quality41>                 %i  </> (default   0% )
380// <quality42>                 %i  </> (default  30% )
381// <quality43>                 %i  </> (default  50% )
382// <quality44>                 %i  </> (default  70% )
383// <quality45>                 %i  </> (default  85% )
384// <linkTextScoreWeight41>     %i  </> (default  50% )
385// <linkTextScoreWeight42>     %i  </> (default 100% )
386// <linkTextScoreWeight43>     %i  </> (default 130% )
387// <linkTextScoreWeight44>     %i  </> (default 180% )
388// <linkTextScoreWeight45>     %i  </> (default 250% )
389
 390// ## map doc's link-adjusted quality to maxScore of its outgoing link text.
391// ## maxScore applies to all docs from this site as to limit a site's impact.
392// <quality51>                 %i  </> (default 
393// <quality52>                 %i  </>
394// <quality53>                 %i  </>
395// <quality54>                 %i  </>
396// <quality55>                 %i  </>
397// <linkTextMaxScore51>        %i  </>
398// <linkTextMaxScore52>        %i  </>
399// <linkTextMaxScore53>        %i  </>
400// <linkTextMaxScore54>        %i  </>
401// <linkTextMaxScore55>        %i  </>
402
403// ## we map the LINK-ADJUSTED QUALITY of our ROOT page (site url) to a quota
404// ## boost. (can be negative)
405// ## the site url is just our site, could be like http://about.com/
406// ## These boosts are MULTIPLIED by the existing quota.
407// <rootQuality71>             %i  </> (default 0   ) 
408// <rootQuality72>             %i  </> (default 50  )
409// <rootQuality73>             %i  </> (default 100 )
410// <rootQuality74>             %i  </> (default 200 )
411// <rootQuality75>             %i  </> (default 500 )
412// <quotaBoost71>              %i  </> (default  0% )
413// <quotaBoost72>              %i  </> (default  0% )
414// <quotaBoost73>              %i  </> (default  0% )
415// <quotaBoost74>              %i  </> (default  0% )
416// <quotaBoost75>              %i  </> (default  0% )
417
418// ## we map the LINK-ADJUSTED QUALITY of our page (site url) to a quota
419// ## boost. (can be negative)
420// ## the site url is just our site, could be like http://about.com/
421// ## These boosts are MULTIPLIED by the existing quota.
422// <quality81>                 %i  </> (default 0   ) 
423// <quality82>                 %i  </> (default 50  )
424// <quality83>                 %i  </> (default 100 )
425// <quality84>                 %i  </> (default 200 )
426// <quality85>                 %i  </> (default 500 )
427// <quotaBoost81>              %i  </> (default  0% )
428// <quotaBoost82>              %i  </> (default  0% )
429// <quotaBoost83>              %i  </> (default  0% )
430// <quotaBoost84>              %i  </> (default  0% )
431// <quotaBoost85>              %i  </> (default  0% )
432
 433// ## the <index> node describes parsing/indexing rules
434// ## used for xhtml tags (title, meta summary/keywords/description)
435// ## NOTE: <score2> <weight2> defines a point on the #words-to-score function
 436// ## NOTE: omit <name> to index whole body (excludes meta tags and xml tags)
437// ## NOTE: set  <name> to "meta.summary" for indexing meta tag summary
438// ## NOTE: set  <name> to "meta.keywords" for indexing meta tag keywords
 439// ## NOTE: set  <name> to "meta.description" for indexing meta tag description
440// ## NOTE: set  <name> to "Xml" for indexing ALL xml tags
441// ## NOTE: set  <name> to ??? for indexing text under that tag <???>...</>
442//  <index>
443//    <name>                     %s      </> ("title","meta.summary","Xml","W")
444//    <indexAsName>              %s      </> (for mapping pure xml tags)
445//    <prefix>                   %s      </> (like "title", "myTag:" -can omit)
446//    <maxQualityForSpamDetect>  %c      </> (default 0, 0 means none)
447//    <minQualityToIndex>        %ul     </> (0-255, default 0  ) do not index
448//    <minDepth>                 %ul     </> (0-inf, default 0  )
449//    <maxDepth>                 %ul     </> (0-inf, default inf)
450//    <maxLenToIndex>            %ul     </> (0-inf, default inf)
451//    <indexAllOccurences>       %b      </> (default no) (ex.: no for title)
452//    <indexCRC>                 %b      </> (default no ) index checksum?
453//    <filterHtmlEntities>       %b      </> (default yes)
454//    <indexIfUniqueOnly>        %b      </> (default no ) hash word iff unique
455//    <indexSingletons>          %b      </> (default yes)
456//    <indexPhrases>             %b      </> (default yes)
457//    <indexAsWhole>             %b      </> (default no ) hash a checksum
458//    <useStopWords>             %b      </> (default yes)
459//    <useStems>                 %b      </> (default yes)
460//
461//    ## Map doc's (link-adjusted) quality to a maxLen for this field.
462//    ## 30% quality is probably average.
463//    ## NOTE: there really are no defaults for these, use tagdb default rec.
464//    <quality11>                 %c  </> (default 15% )
465//    <quality12>                 %c  </> (default 30% )
466//    <quality13>                 %c  </> (default 45% )
467//    <quality14>                 %c  </> (default 60% )
468//    <quality15>                 %c  </> (default 80% )
469//    <maxLen11>                  %ul </> (default 80k )
470//    <maxLen12>                  %ul </> (default 100k)
471//    <maxLen13>                  %ul </> (default 150k)
472//    <maxLen14>                  %ul </> (default 200k)
473//    <maxLen15>                  %ul </> (default 250k)
474//
475//    ## Map doc's (link-adjusted) quality to a maxScore for this field.
476//    <quality21>                 %c  </> (default 15% )
477//    <quality22>                 %c  </> (default 30% )
478//    <quality23>                 %c  </> (default 45% )
479//    <quality24>                 %c  </> (default 60% )
480//    <quality25>                 %c  </> (default 80% )
481//    <maxScore21>                %ul </> (default 30% )
482//    <maxScore22>                %ul </> (default 45% )
483//    <maxScore23>                %ul </> (default 60% )
484//    <maxScore24>                %ul </> (default 80% )
485//    <maxScore25>                %ul </> (default 100%)
486//
487//    ## map doc (link-adjusted) quality to a scoreWeight for this field
488//    <quality31>                 %c  </> (default 15% )
489//    <quality32>                 %c  </> (default 30% )
490//    <quality33>                 %c  </> (default 45% )
491//    <quality34>                 %c  </> (default 60% )
492//    <quality35>                 %c  </> (default 80% )
493//    <scoreWeight31>             %ul </> (default 60% )
494//    <scoreWeight32>             %ul </> (default 100%)
495//    <scoreWeight33>             %ul </> (default 150%)
496//    <scoreWeight34>             %ul </> (default 200%)
497//    <scoreWeight35>             %ul </> (default 250%)
498//
499//    ## map field length to a scoreWeight for this field
500//    <len41>                    %ul  </> (default 100) #w<100 -->wght=300
501//    <len42>                    %ul  </> (default 500) score in[200,300]
502//    <len43>                    %ul  </> (default 1000)
503//    <len44>                    %ul  </> (default 2000)
504//    <len45>                    %ul  </> (default 5000) if under/over 5000
505//    <scoreWeight41>            %ul  </> (default 300%) 
506//    <scoreWeight42>            %ul  </> (default 200%) 
507//    <scoreWeight43>            %ul  </> (default 150%) 
508//    <scoreWeight44>            %ul  </> (default 100%) 
509//    <scoreWeight45>            %ul  </> (default  50%) 
510//
511//    ## map field length to a maxScore for this field
512//    <len51>                    %ul  </> (default 100) #w<100 -->wght=300
513//    <len52>                    %ul  </> (default 500) score in[200,300]
514//    <len53>                    %ul  </> (default 1000)
515//    <len54>                    %ul  </> (default 2000)
516//    <len55>                    %ul  </> (default 5000) if under/over 5000
517//    <maxScore51>               %ul  </> (default 30% )
518//    <maxScore52>               %ul  </> (default 45% )
519//    <maxScore53>               %ul  </> (default 60% )
520//    <maxScore54>               %ul  </> (default 80% )
521//    <maxScore55>               %ul  </> (default 100%)
522//
523//  </>
524
525// TODO:
526// <indexAsLong>, <indexAsBool>, ... for pure xml tags w/ special meaning
527//