PageRenderTime 33ms CodeModel.GetById 1ms app.highlight 22ms RepoModel.GetById 5ms app.codeStats 0ms

/ToMigrate/Raven.Database/Queries/MoreLikeThisQueryRunner.cs

Relevant Search: With Applications for Solr and Elasticsearch

'Chapter 4. Taming tokens'. If you want to know how to extract ideas rather than words this book is for you. Learn concepts of precision and recall, making trade-offs between them and controlling the specificity of matches. Amazon Affiliate Link
http://github.com/ayende/ravendb
C# | 259 lines | 209 code | 44 blank | 6 comment | 58 complexity | 38e0079da1055ecbba2cb6221219ad70 MD5 | raw file
  1//-----------------------------------------------------------------------
  2// <copyright file="MoreLikeThisQueryRunner.cs" company="Hibernating Rhinos LTD">
  3//     Copyright (c) Hibernating Rhinos LTD. All rights reserved.
  4// </copyright>
  5//-----------------------------------------------------------------------
  6using System;
  7using System.Collections.Generic;
  8using System.Linq;
  9using System.Threading;
 10
 11using Lucene.Net.Documents;
 12using Lucene.Net.Index;
 13using Lucene.Net.Search;
 14using Raven.Abstractions.Data;
 15using Raven.Abstractions.Extensions;
 16using Raven.Abstractions.Linq;
 17using Raven.Abstractions.Util.Encryptors;
 18using Raven.Database.Bundles.MoreLikeThis;
 19using Raven.Database.Data;
 20using Raven.Database.Impl;
 21using Raven.Database.Indexing;
 22using Raven.Database.Linq;
 23using Raven.Json.Linq;
 24
 25using Index = Raven.Database.Indexing.Index;
 26using Sparrow;
 27
 28namespace Raven.Database.Queries
 29{
 30    public class MoreLikeThisQueryRunner
 31    {
 32        private readonly DocumentDatabase database;
 33
 34        private HashSet<string> idsToLoad;
 35
 36        private DocumentRetriever documentRetriever;
 37
 38        public MoreLikeThisQueryRunner(DocumentDatabase database)
 39        {
 40            this.database = database;
 41        }
 42
 43        public MoreLikeThisQueryResult ExecuteMoreLikeThisQuery(MoreLikeThisQuery query, int pageSize = 25)
 44        {
 45            if (query == null) throw new ArgumentNullException("query");
 46
 47            var index = database.IndexStorage.GetIndexInstance(query.IndexName);
 48            if (index == null)
 49                throw new InvalidOperationException("The index " + query.IndexName + " cannot be found");
 50
 51            if (string.IsNullOrEmpty(query.DocumentId) && query.MapGroupFields.Count == 0)
 52                throw new InvalidOperationException("The document id or map group fields are mandatory");
 53
 54            IndexSearcher searcher;
 55            using (database.IndexStorage.GetCurrentIndexSearcher(index.indexId, out searcher))
 56            {
 57                var documentQuery = new BooleanQuery();
 58
 59                if (string.IsNullOrEmpty(query.DocumentId) == false)
 60                {
 61                    documentQuery.Add(new TermQuery(new Term(Constants.DocumentIdFieldName, query.DocumentId.ToLowerInvariant())), Occur.MUST);
 62                }
 63
 64                foreach (string key in query.MapGroupFields.Keys)
 65                {
 66                    documentQuery.Add(new TermQuery(new Term(key, query.MapGroupFields[key])), Occur.MUST);
 67                }
 68
 69                var td = searcher.Search(documentQuery, 1);
 70
 71                // get the current Lucene docid for the given RavenDB doc ID
 72                if (td.ScoreDocs.Length == 0)
 73                    throw new InvalidOperationException("Document " + query.DocumentId + " could not be found");
 74
 75                var ir = searcher.IndexReader;
 76                var mlt = new RavenMoreLikeThis(ir);
 77
 78                AssignParameters(mlt, query);
 79
 80                if (string.IsNullOrWhiteSpace(query.StopWordsDocumentId) == false)
 81                {
 82                    var stopWordsDoc = database.Documents.Get(query.StopWordsDocumentId);
 83                    if (stopWordsDoc == null)
 84                        throw new InvalidOperationException("Stop words document " + query.StopWordsDocumentId + " could not be found");
 85
 86                    var stopWordsSetup = stopWordsDoc.DataAsJson.JsonDeserialization<StopWordsSetup>();
 87                    if (stopWordsSetup.StopWords != null)
 88                    {
 89                        var stopWords = stopWordsSetup.StopWords;
 90                        var ht = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase);
 91                        foreach (var stopWord in stopWords)
 92                        {
 93                            ht.Add(stopWord);
 94                        }
 95                        mlt.SetStopWords(ht);
 96                    }
 97                }
 98
 99                var fieldNames = query.Fields ?? GetFieldNames(ir);
100                mlt.SetFieldNames(fieldNames);
101
102                var toDispose = new List<Action>();
103                RavenPerFieldAnalyzerWrapper perFieldAnalyzerWrapper = null;
104                try
105                {
106                    perFieldAnalyzerWrapper = index.CreateAnalyzer(new LowerCaseKeywordAnalyzer(), toDispose, true);
107                    mlt.Analyzer = perFieldAnalyzerWrapper;
108
109                    var mltQuery = mlt.Like(td.ScoreDocs[0].Doc);
110                    var tsdc = TopScoreDocCollector.Create(pageSize, true);
111
112
113                    if (string.IsNullOrWhiteSpace(query.AdditionalQuery) == false)
114                    {
115                        var additionalQuery = QueryBuilder.BuildQuery(query.AdditionalQuery, perFieldAnalyzerWrapper);
116                        mltQuery = new BooleanQuery
117                        {
118                            {mltQuery, Occur.MUST},
119                            {additionalQuery, Occur.MUST}, 
120                        };
121                    }
122
123                    searcher.Search(mltQuery, tsdc);
124                    var hits = tsdc.TopDocs().ScoreDocs;
125                    var jsonDocuments = GetJsonDocuments(query, searcher, index, query.IndexName, hits, td.ScoreDocs[0].Doc);
126
127                    var result = new MultiLoadResult();
128
129                    var includedEtags = new List<byte>(jsonDocuments.SelectMany(x => x.Etag.ToByteArray()));
130                    includedEtags.AddRange(database.Indexes.GetIndexEtag(query.IndexName, null).ToByteArray());
131                    var loadedIds = new HashSet<string>(jsonDocuments.Select(x => x.Key));
132                    var addIncludesCommand = new AddIncludesCommand(database, (etag, includedDoc) =>
133                    {
134                        includedEtags.AddRange(etag.ToByteArray());
135                        result.Includes.Add(includedDoc);
136                    }, query.Includes ?? new string[0], loadedIds);
137
138                    idsToLoad = new HashSet<string>();
139
140                    database.TransactionalStorage.Batch(actions =>
141                    {
142                        documentRetriever = new DocumentRetriever(database.Configuration, actions, database.ReadTriggers, query.TransformerParameters, idsToLoad);
143
144                        using (new CurrentTransformationScope(database, documentRetriever))
145                        {
146                            foreach (var document in ProcessResults(query, jsonDocuments, database.WorkContext.CancellationToken))
147                            {
148                                result.Results.Add(document);
149                                addIncludesCommand.Execute(document);
150                            }
151                        }
152                    });
153
154                    addIncludesCommand.AlsoInclude(idsToLoad);
155
156                    Etag computedEtag = Etag.FromHash(Hashing.Metro128.Calculate(includedEtags.ToArray()));
157
158                    return new MoreLikeThisQueryResult
159                    {
160                        Etag = computedEtag,
161                        Result = result,
162                    };
163                }
164                finally
165                {
166                    if (perFieldAnalyzerWrapper != null)
167                        perFieldAnalyzerWrapper.Close();
168                    foreach (var action in toDispose)
169                    {
170                        action();
171                    }
172                }
173            }
174        }
175
176        private IEnumerable<RavenJObject> ProcessResults(MoreLikeThisQuery query, IEnumerable<JsonDocument> documents, CancellationToken token)
177        {
178            IndexingFunc transformFunc = null;
179
180            if (string.IsNullOrEmpty(query.ResultsTransformer) == false)
181            {
182                var transformGenerator = database.IndexDefinitionStorage.GetTransformer(query.ResultsTransformer);
183
184                if (transformGenerator != null && transformGenerator.TransformResultsDefinition != null)
185                    transformFunc = transformGenerator.TransformResultsDefinition;
186                else
187                    throw new InvalidOperationException("The transformer " + query.ResultsTransformer + " was not found");
188            }
189
190            IEnumerable<RavenJObject> results;
191            var transformerErrors = new List<string>();
192
193            if (transformFunc == null)
194                results = documents.Select(x => x.ToJson());
195            else
196            {
197                var robustEnumerator = new RobustEnumerator(token, 100,
198                    onError: (exception, o) => transformerErrors.Add(string.Format("Doc '{0}', Error: {1}", Index.TryGetDocKey(o), exception.Message)));
199
200                results = robustEnumerator
201                    .RobustEnumeration(documents.Select(x => new DynamicJsonObject(x.ToJson())).GetEnumerator(), transformFunc)
202                    .Select(JsonExtensions.ToJObject);
203            }
204
205            return results;
206        }
207
208        private JsonDocument[] GetJsonDocuments(MoreLikeThisQuery parameters, IndexSearcher searcher, Index index, string indexName, IEnumerable<ScoreDoc> hits, int baseDocId)
209        {
210            if (string.IsNullOrEmpty(parameters.DocumentId) == false)
211            {
212                var documentIds = hits
213                    .Where(hit => hit.Doc != baseDocId)
214                    .Select(hit => searcher.Doc(hit.Doc).Get(Constants.DocumentIdFieldName))
215                    .Where(x => x != null)
216                    .Distinct();
217
218                return documentIds
219                    .Select(docId => database.Documents.Get(docId))
220                    .Where(it => it != null)
221                    .ToArray();
222            }
223
224            var fields = searcher.Doc(baseDocId).GetFields().Cast<AbstractField>().Select(x => x.Name).Distinct().ToArray();
225            var etag = database.Indexes.GetIndexEtag(indexName, null);
226            return hits
227                .Where(hit => hit.Doc != baseDocId)
228                .Select(hit => new JsonDocument
229                {
230                    DataAsJson = Index.CreateDocumentFromFields(searcher.Doc(hit.Doc),
231                                                                new FieldsToFetch(fields, false, index.IsMapReduce ? Constants.ReduceKeyFieldName : Constants.DocumentIdFieldName)),
232                    Etag = etag
233                })
234                .ToArray();
235        }
236
237        private static void AssignParameters(Lucene.Net.Search.Similar.MoreLikeThis mlt, MoreLikeThisQuery parameters)
238        {
239            if (parameters.Boost != null) mlt.Boost = parameters.Boost.Value;
240            if (parameters.BoostFactor != null) mlt.BoostFactor = parameters.BoostFactor.Value;
241            if (parameters.MaximumNumberOfTokensParsed != null) mlt.MaxNumTokensParsed = parameters.MaximumNumberOfTokensParsed.Value;
242            if (parameters.MaximumQueryTerms != null) mlt.MaxQueryTerms = parameters.MaximumQueryTerms.Value;
243            if (parameters.MinimumWordLength != null) mlt.MinWordLen = parameters.MinimumWordLength.Value;
244            if (parameters.MaximumWordLength != null) mlt.MaxWordLen = parameters.MaximumWordLength.Value;
245            if (parameters.MinimumTermFrequency != null) mlt.MinTermFreq = parameters.MinimumTermFrequency.Value;
246            if (parameters.MinimumDocumentFrequency != null) mlt.MinDocFreq = parameters.MinimumDocumentFrequency.Value;
247            if (parameters.MaximumDocumentFrequency != null) mlt.MaxDocFreq = parameters.MaximumDocumentFrequency.Value;
248            if (parameters.MaximumDocumentFrequencyPercentage != null) mlt.SetMaxDocFreqPct(parameters.MaximumDocumentFrequencyPercentage.Value);
249        }
250
251        private static string[] GetFieldNames(IndexReader indexReader)
252        {
253            var fields = indexReader.GetFieldNames(IndexReader.FieldOption.INDEXED);
254            return fields
255                .Where(x => x != Constants.DocumentIdFieldName && x != Constants.ReduceKeyFieldName)
256                .ToArray();
257        }
258    }
259}