PageRenderTime 40ms CodeModel.GetById 3ms app.highlight 30ms RepoModel.GetById 1ms app.codeStats 0ms

/ToMigrate/Raven.Database/Indexing/MapReduceIndex.cs

Relevant Search: With Applications for Solr and Elasticsearch

'Chapter 4. Taming tokens'. If you want to know how to extract ideas rather than words this book is for you. Learn concepts of precision and recall, making trade-offs between them and controlling the specificity of matches. Amazon Affiliate Link
http://github.com/ayende/ravendb
C# | 887 lines | 743 code | 128 blank | 16 comment | 77 complexity | b6d38a3b52086945c3e703a8d2d97936 MD5 | raw file
  1//-----------------------------------------------------------------------
  2// <copyright file="MapReduceIndex.cs" company="Hibernating Rhinos LTD">
  3//     Copyright (c) Hibernating Rhinos LTD. All rights reserved.
  4// </copyright>
  5//-----------------------------------------------------------------------
  6using System;
  7using System.Collections;
  8using System.Collections.Concurrent;
  9using System.Collections.Generic;
 10using System.ComponentModel;
 11using System.Diagnostics;
 12using System.Globalization;
 13using System.Linq;
 14using System.Text;
 15using System.Threading;
 16using Lucene.Net.Analysis;
 17using Lucene.Net.Documents;
 18using Lucene.Net.Index;
 19using Lucene.Net.Search;
 20using Lucene.Net.Store;
 21using Raven.Abstractions.Logging;
 22using Raven.Database.Extensions;
 23using Raven.Database.Plugins;
 24using Raven.Database.Util;
 25using Raven.Imports.Newtonsoft.Json;
 26using Raven.Abstractions;
 27using Raven.Abstractions.Extensions;
 28using Raven.Abstractions.Data;
 29using Raven.Abstractions.Indexing;
 30using Raven.Abstractions.Linq;
 31using Raven.Database.Data;
 32using Raven.Database.Linq;
 33using Raven.Database.Storage;
 34using Raven.Imports.Newtonsoft.Json.Linq;
 35using Raven.Json.Linq;
 36using Spatial4n.Core.Exceptions;
 37using Sparrow.Collections;
 38
 39namespace Raven.Database.Indexing
 40{
 41    internal class MapReduceIndex : Index
 42    {
        // Serializer used to turn map results into RavenJObjects; configured in the
        // constructor with the shared MapReduceConverters below.
        readonly JsonSerializer jsonSerializer;

        // Frozen converter collection shared by every map/reduce index instance.
        private static readonly JsonConverterCollection MapReduceConverters;
 46
 47        static MapReduceIndex()
 48        {
 49            MapReduceConverters = new JsonConverterCollection(Default.Converters)
 50            {
 51                new IgnoreFieldable()
 52            };
 53
 54            MapReduceConverters.Freeze();
 55        }
 56
 57        private class IgnoreFieldable : JsonConverter
 58        {
 59            public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
 60            {
 61                writer.WriteValue("IgnoredLuceueField");
 62            }
 63
 64            public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
 65            {
 66                return null;
 67            }
 68
 69            public override bool CanConvert(Type objectType)
 70            {
 71                return typeof(IFieldable).IsAssignableFrom(objectType) ||
 72                       typeof(IEnumerable<AbstractField>).IsAssignableFrom(objectType);
 73            }
 74        }
 75
        /// <summary>
        /// Creates a map/reduce index over the given Lucene directory.
        /// </summary>
        public MapReduceIndex(Directory directory, int id, IndexDefinition indexDefinition,
                              AbstractViewGenerator viewGenerator, WorkContext context)
            : base(directory, id, indexDefinition, viewGenerator, context)
        {
            // Per-instance serializer wired to the shared frozen converter set so
            // Lucene field objects are skipped during mapped-result serialization.
            jsonSerializer = JsonExtensions.CreateDefaultJsonSerializer();
            jsonSerializer.Converters = MapReduceConverters;
        }
 83
        // Always true for this index type; used by callers to distinguish map-only indexes.
        public override bool IsMapReduce
        {
            get { return true; }
        }
 88
 89        public override IndexingPerformanceStats IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp, CancellationToken token)
 90        {
 91            token.ThrowIfCancellationRequested();
 92
 93            var count = 0;
 94            var sourceCount = 0;
 95            var deleted = new Dictionary<ReduceKeyAndBucket, int>();
 96            var performance = RecordCurrentBatch("Current Map", "Map", batch.Docs.Count);
 97            var performanceStats = new List<BasePerformanceStats>();
 98
 99            var usedStorageAccessors = new ConcurrentSet<IStorageActionsAccessor>();
100
101            if (usedStorageAccessors.TryAdd(actions))
102            {
103                var storageCommitDuration = new Stopwatch();
104
105                actions.BeforeStorageCommit += storageCommitDuration.Start;
106
107                actions.AfterStorageCommit += () =>
108                {
109                    storageCommitDuration.Stop();
110
111                    performanceStats.Add(PerformanceStats.From(IndexingOperation.StorageCommit, storageCommitDuration.ElapsedMilliseconds));
112                };
113            }
114
115            var deleteMappedResultsDuration = new Stopwatch();
116            var documentsWrapped = batch.Docs.Select(doc =>
117            {
118                token.ThrowIfCancellationRequested();
119
120                sourceCount++;
121                var documentId = doc.__document_id;
122
123                using (StopwatchScope.For(deleteMappedResultsDuration))
124                {
125                    actions.MapReduce.DeleteMappedResultsForDocumentId((string)documentId, indexId, deleted);
126                }
127
128                return doc;
129            })
130            .Where(x => x is FilteredDocument == false)
131            .ToList();
132
133            performanceStats.Add(new PerformanceStats
134            {
135                Name = IndexingOperation.Map_DeleteMappedResults,
136                DurationMs = deleteMappedResultsDuration.ElapsedMilliseconds,
137            });
138
139            var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
140            var allReferenceEtags = new ConcurrentQueue<IDictionary<string, Etag>>();
141            var allState = new ConcurrentQueue<Tuple<HashSet<ReduceKeyAndBucket>, IndexingWorkStats, Dictionary<string, int>>>();
142
143            var parallelOperations = new ConcurrentQueue<ParallelBatchStats>();
144
145            var parallelProcessingStart = SystemTime.UtcNow;
146
147            context.Database.MappingThreadPool.ExecuteBatch(documentsWrapped, (IEnumerator<dynamic> partition) =>
148            {
149                token.ThrowIfCancellationRequested();
150                var parallelStats = new ParallelBatchStats
151                {
152                    StartDelay = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds
153                };
154
155                var localStats = new IndexingWorkStats();
156                var localChanges = new HashSet<ReduceKeyAndBucket>();
157                var statsPerKey = new Dictionary<string, int>();
158
159                var linqExecutionDuration = new Stopwatch();
160                var reduceInMapLinqExecutionDuration = new Stopwatch();
161                var putMappedResultsDuration = new Stopwatch();
162                var convertToRavenJObjectDuration = new Stopwatch();
163
164                allState.Enqueue(Tuple.Create(localChanges, localStats, statsPerKey));
165
166                using (CurrentIndexingScope.Current = new CurrentIndexingScope(context.Database, PublicName))
167                {
168                    // we are writing to the transactional store from multiple threads here, and in a streaming fashion
169                    // should result in less memory and better perf
170                    context.TransactionalStorage.Batch(accessor =>
171                    {
172                        if (usedStorageAccessors.TryAdd(accessor))
173                        {
174                            var storageCommitDuration = new Stopwatch();
175
176                            accessor.BeforeStorageCommit += storageCommitDuration.Start;
177
178                            accessor.AfterStorageCommit += () =>
179                            {
180                                storageCommitDuration.Stop();
181
182                                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.StorageCommit, storageCommitDuration.ElapsedMilliseconds));
183                            };
184                        }
185
186                        var mapResults = RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, localStats, linqExecutionDuration);
187                        var currentDocumentResults = new List<object>();
188                        string currentKey = null;
189                        bool skipDocument = false;
190
191                        foreach (var currentDoc in mapResults)
192                        {
193                            token.ThrowIfCancellationRequested();
194
195                            var documentId = GetDocumentId(currentDoc);
196                            if (documentId != currentKey)
197                            {
198                                count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey, reduceInMapLinqExecutionDuration, putMappedResultsDuration, convertToRavenJObjectDuration);
199
200                                currentDocumentResults.Clear();
201                                currentKey = documentId;
202                            }
203                            else if (skipDocument)
204                            {
205                                continue;
206                            }
207
208                            RavenJObject currentDocJObject;
209                            using (StopwatchScope.For(convertToRavenJObjectDuration))
210                            {
211                                currentDocJObject = RavenJObject.FromObject(currentDoc, jsonSerializer);
212                            }
213
214                            currentDocumentResults.Add(new DynamicJsonObject(currentDocJObject));
215
216                            if (EnsureValidNumberOfOutputsForDocument(documentId, currentDocumentResults.Count) == false)
217                            {
218                                skipDocument = true;
219                                currentDocumentResults.Clear();
220                                continue;
221                            }
222
223                            Interlocked.Increment(ref localStats.IndexingSuccesses);
224                        }
225                        count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey, reduceInMapLinqExecutionDuration, putMappedResultsDuration, convertToRavenJObjectDuration);
226
227                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.LoadDocument, CurrentIndexingScope.Current.LoadDocumentDuration.ElapsedMilliseconds));
228                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Linq_MapExecution, linqExecutionDuration.ElapsedMilliseconds));
229                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Linq_ReduceLinqExecution, reduceInMapLinqExecutionDuration.ElapsedMilliseconds));
230                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_PutMappedResults, putMappedResultsDuration.ElapsedMilliseconds));
231                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_ConvertToRavenJObject, convertToRavenJObjectDuration.ElapsedMilliseconds));
232
233                        parallelOperations.Enqueue(parallelStats);
234                    });
235
236                    allReferenceEtags.Enqueue(CurrentIndexingScope.Current.ReferencesEtags);
237                    allReferencedDocs.Enqueue(CurrentIndexingScope.Current.ReferencedDocuments);
238                }
239            }, description: string.Format("Reducing index {0} up to Etag {1}, for {2} documents", this.PublicName, batch.HighestEtagBeforeFiltering, documentsWrapped.Count));
240
241            performanceStats.Add(new ParallelPerformanceStats
242            {
243                NumberOfThreads = parallelOperations.Count,
244                DurationMs = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds,
245                BatchedOperations = parallelOperations.ToList()
246            });
247
248            var updateDocumentReferencesDuration = new Stopwatch();
249            using (StopwatchScope.For(updateDocumentReferencesDuration))
250            {
251                UpdateDocumentReferences(actions, allReferencedDocs, allReferenceEtags);
252            }
253            performanceStats.Add(PerformanceStats.From(IndexingOperation.UpdateDocumentReferences, updateDocumentReferencesDuration.ElapsedMilliseconds));
254
255            var changed = allState.SelectMany(x => x.Item1).Concat(deleted.Keys)
256                    .Distinct()
257                    .ToList();
258
259            var stats = new IndexingWorkStats(allState.Select(x => x.Item2));
260            var reduceKeyStats = allState.SelectMany(x => x.Item3)
261                                         .GroupBy(x => x.Key)
262                                         .Select(g => new { g.Key, Count = g.Sum(x => x.Value) })
263                                         .ToList();
264
265            var reduceKeyToCount = new ConcurrentDictionary<string, int>();
266            foreach (var singleDeleted in deleted)
267            {
268                var reduceKey = singleDeleted.Key.ReduceKey;
269                reduceKeyToCount[reduceKey] = reduceKeyToCount.GetOrDefault(reduceKey) + singleDeleted.Value;
270            }
271
272            context.Database.MappingThreadPool.ExecuteBatch(reduceKeyStats, enumerator => context.TransactionalStorage.Batch(accessor =>
273            {
274                while (enumerator.MoveNext())
275                {
276                    var reduceKeyStat = enumerator.Current;
277                    var value = 0;
278                    reduceKeyToCount.TryRemove(reduceKeyStat.Key, out value);
279                    var changeValue = reduceKeyStat.Count - value;
280                    if (changeValue == 0)
281                    {
282                        // nothing to change
283                        continue;
284                    }
285
286                    accessor.MapReduce.ChangeReduceKeyCounterValue(indexId, reduceKeyStat.Key, changeValue);
287                }
288            }), description: string.Format("Incrementing Reducing key counter fo index {0} for operation from Etag {1} to Etag {2}", this.PublicName, this.GetLastEtagFromStats(), batch.HighestEtagBeforeFiltering));
289
290            foreach (var keyValuePair in reduceKeyToCount)
291            {
292                // those are the reduce keys that were replaced
293                actions.MapReduce.ChangeReduceKeyCounterValue(indexId, keyValuePair.Key, -keyValuePair.Value);
294            }
295
296            actions.General.MaybePulseTransaction();
297
298            var parallelReductionOperations = new ConcurrentQueue<ParallelBatchStats>();
299            var parallelReductionStart = SystemTime.UtcNow;
300
301            context.Database.MappingThreadPool.ExecuteBatch(changed, enumerator => context.TransactionalStorage.Batch(accessor =>
302            {
303                var parallelStats = new ParallelBatchStats
304                {
305                    StartDelay = (long)(SystemTime.UtcNow - parallelReductionStart).TotalMilliseconds
306                };
307
308                var scheduleReductionsDuration = new Stopwatch();
309
310                using (StopwatchScope.For(scheduleReductionsDuration))
311                {
312                    while (enumerator.MoveNext())
313                    {
314                        accessor.MapReduce.ScheduleReductions(indexId, 0, enumerator.Current);
315                        accessor.General.MaybePulseTransaction();
316                    }
317                }
318
319                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_ScheduleReductions, scheduleReductionsDuration.ElapsedMilliseconds));
320                parallelReductionOperations.Enqueue(parallelStats);
321            }), description: string.Format("Map Scheduling Reducitions for index {0} after operation from Etag {1} to Etag {2}", this.PublicName, this.GetLastEtagFromStats(), batch.HighestEtagBeforeFiltering));
322
323            performanceStats.Add(new ParallelPerformanceStats
324            {
325                NumberOfThreads = parallelReductionOperations.Count,
326                DurationMs = (long)(SystemTime.UtcNow - parallelReductionStart).TotalMilliseconds,
327                BatchedOperations = parallelReductionOperations.ToList()
328            });
329
330            UpdateIndexingStats(context, stats);
331
332            performance.OnCompleted = () => BatchCompleted("Current Map", "Map", sourceCount, count, performanceStats);
333            if (logIndexing.IsDebugEnabled)
334            logIndexing.Debug("Mapped {0} documents for {1}", count, PublicName);
335
336            return performance;
337        }
338
339        private int ProcessBatch(AbstractViewGenerator viewGenerator, List<object> currentDocumentResults, string currentKey, HashSet<ReduceKeyAndBucket> changes,
340            IStorageActionsAccessor actions,
341            IDictionary<string, int> statsPerKey, Stopwatch reduceDuringMapLinqExecution, Stopwatch putMappedResultsDuration, Stopwatch convertToRavenJObjectDuration)
342        {
343            if (currentKey == null || currentDocumentResults.Count == 0)
344            {
345                return 0;
346            }
347
348            var old = CurrentIndexingScope.Current;
349            try
350            {
351                CurrentIndexingScope.Current = null;
352
353                if (logIndexing.IsDebugEnabled)
354                {
355                    var sb = new StringBuilder()
356                        .AppendFormat("Index {0} for document {1} resulted in:", PublicName, currentKey)
357                        .AppendLine();
358                    foreach (var currentDocumentResult in currentDocumentResults)
359                    {
360                        sb.AppendLine(JsonConvert.SerializeObject(currentDocumentResult));
361                    }
362                    logIndexing.Debug(sb.ToString());
363                }
364
365                int count = 0;
366
367                var results = RobustEnumerationReduceDuringMapPhase(currentDocumentResults.GetEnumerator(), viewGenerator.ReduceDefinition, reduceDuringMapLinqExecution);
368                foreach (var doc in results)
369                {
370                    count++;
371
372                    var reduceValue = viewGenerator.GroupByExtraction(doc);
373                    if (reduceValue == null)
374                    {
375                        if (logIndexing.IsDebugEnabled)
376                        logIndexing.Debug("Field {0} is used as the reduce key and cannot be null, skipping document {1}",
377                                          viewGenerator.GroupByExtraction, currentKey);
378                        continue;
379                    }
380                    string reduceKey = ReduceKeyToString(reduceValue);
381
382                    RavenJObject data;
383                    using (StopwatchScope.For(convertToRavenJObjectDuration))
384                    {
385                        data = GetMappedData(doc);
386                    }
387
388                    if (logIndexing.IsDebugEnabled)
389                    {
390                        logIndexing.Debug("Index {0} for document {1} resulted in ({2}): {3}", PublicName, currentKey, reduceKey, data);
391                    }
392
393                    using (StopwatchScope.For(putMappedResultsDuration))
394                    {
395                        actions.MapReduce.PutMappedResult(indexId, currentKey, reduceKey, data);
396                    }
397
398                    statsPerKey[reduceKey] = statsPerKey.GetOrDefault(reduceKey) + 1;
399                    actions.General.MaybePulseTransaction();
400                    changes.Add(new ReduceKeyAndBucket(IndexingUtil.MapBucket(currentKey), reduceKey));
401                }
402                return count;
403            }
404            finally
405            {
406                CurrentIndexingScope.Current = old;
407            }
408        }
409
410        private RavenJObject GetMappedData(object doc)
411        {
412            if (doc is IDynamicJsonObject)
413                return ((IDynamicJsonObject)doc).Inner;
414
415            var ravenJTokenWriter = new RavenJTokenWriter();
416            jsonSerializer.Serialize(ravenJTokenWriter, doc);
417            return (RavenJObject)ravenJTokenWriter.Token;
418        }
419
        // Caches, per CLR type of map result, a delegate that extracts the source document id.
        private static readonly ConcurrentDictionary<Type, Func<object, object>> documentIdFetcherCache =
            new ConcurrentDictionary<Type, Func<object, object>>();
422
423        private static string GetDocumentId(object doc)
424        {
425            var docIdFetcher = documentIdFetcherCache.GetOrAdd(doc.GetType(), type =>
426            {
427                // document may be DynamicJsonObject if we are using compiled views
428                if (typeof(DynamicJsonObject) == type)
429                {
430                    return i => ((dynamic)i).__document_id;
431                }
432                var docIdProp = TypeDescriptor.GetProperties(doc).Find(Constants.DocumentIdFieldName, false);
433                return docIdProp.GetValue;
434            });
435            if (docIdFetcher == null)
436                throw new InvalidOperationException("Could not create document id fetcher for this document");
437            var documentId = docIdFetcher(doc);
438            if (documentId == null || documentId is DynamicNullObject)
439                throw new InvalidOperationException("Could not get document id fetcher for this document");
440
441            return (string)documentId;
442        }
443
444        internal static string ReduceKeyToString(object reduceValue)
445        {
446            var reduceValueAsString = reduceValue as string;
447            if (reduceValueAsString != null)
448                return reduceValueAsString;
449
450            if (reduceValue is DateTime)
451                return ((DateTime)reduceValue).GetDefaultRavenFormat();
452            if (reduceValue is DateTimeOffset)
453                return ((DateTimeOffset)reduceValue).ToString(Default.DateTimeFormatsToWrite, CultureInfo.InvariantCulture);
454            if (reduceValue is ValueType)
455                return reduceValue.ToString();
456
457            var dynamicJsonObject = reduceValue as IDynamicJsonObject;
458            if (dynamicJsonObject != null)
459                return dynamicJsonObject.Inner.ToString(Formatting.None);
460
461            return RavenJToken.FromObject(reduceValue).ToString(Formatting.None);
462        }
463
464        protected override IndexQueryResult RetrieveDocument(Document document, FieldsToFetch fieldsToFetch, ScoreDoc score)
465        {
466            fieldsToFetch.EnsureHasField(Constants.ReduceKeyFieldName);
467            if (fieldsToFetch.HasExplicitFieldsToFetch)
468            {
469                return base.RetrieveDocument(document, fieldsToFetch, score);
470            }
471            var field = document.GetField(Constants.ReduceValueFieldName);
472            if (field == null)
473            {
474                fieldsToFetch = fieldsToFetch.CloneWith(document.GetFields().Select(x => x.Name).ToArray());
475                return base.RetrieveDocument(document, fieldsToFetch, score);
476            }
477            var projection = RavenJObject.Parse(field.StringValue);
478            if (fieldsToFetch.FetchAllStoredFields)
479            {
480                var fields = new HashSet<string>(document.GetFields().Select(x => x.Name));
481                fields.Remove(Constants.ReduceKeyFieldName);
482                var documentFromFields = new RavenJObject();
483                AddFieldsToDocument(document, fields, documentFromFields);
484                foreach (var kvp in projection)
485                {
486                    documentFromFields[kvp.Key] = kvp.Value;
487                }
488                projection = documentFromFields;
489            }
490            return new IndexQueryResult
491            {
492                Projection = projection,
493                Score = score.Score,
494                ReduceVal = field.StringValue
495            };
496        }
497
        // Intentionally a no-op for this index type.
        protected override void HandleCommitPoints(IndexedItemsInfo itemsInfo, IndexSegmentsInfo segmentsInfo)
        {
            // MapReduce index does not store and use any commit points
        }
502
        /// <summary>
        /// Map/reduce indexes always flush to disk regardless of the etag position.
        /// </summary>
        protected override bool IsUpToDateEnoughToWriteToDisk(Etag highestETag)
        {
            // for map/reduce indexes, we always write to disk, the in memory optimization
            // isn't really doing much for us, since we already write the intermediate results 
            // to disk anyway, so it doesn't matter
            return true;
        }
510
        /// <summary>
        /// Removes the given document keys from the index: deletes their mapped results,
        /// updates map/reduce statistics, schedules the affected reduce keys for
        /// re-reduction, and deletes the matching entries from the Lucene index.
        /// </summary>
        public override void Remove(string[] keys, WorkContext context)
        {
            context.TransactionalStorage.Batch(actions =>
            {
                // Collect every reduce bucket that loses a mapped result.
                var reduceKeyAndBuckets = new Dictionary<ReduceKeyAndBucket, int>();
                foreach (var key in keys)
                {
                    context.CancellationToken.ThrowIfCancellationRequested();
                    actions.MapReduce.DeleteMappedResultsForDocumentId(key, indexId, reduceKeyAndBuckets);
                }

                actions.MapReduce.UpdateRemovedMapReduceStats(indexId, reduceKeyAndBuckets, context.CancellationToken);

                // Re-reduce every affected bucket so the aggregated values shrink accordingly.
                foreach (var reduceKeyAndBucket in reduceKeyAndBuckets)
                {
                    context.CancellationToken.ThrowIfCancellationRequested();
                    actions.MapReduce.ScheduleReductions(indexId, 0, reduceKeyAndBucket.Key);
                }
            });

            Write((writer, analyzer, stats) =>
            {
                // Deletions should not count towards the indexing success/failure stats.
                stats.Operation = IndexingWorkStats.Status.Ignore;
                if (logIndexing.IsDebugEnabled)
                logIndexing.Debug(() => string.Format("Deleting ({0}) from {1}", string.Join(", ", keys), PublicName));

                var batchers = context.IndexUpdateTriggers.Select(x => x.CreateBatcher(indexId))
                    .Where(x => x != null)
                    .ToList();

                // Notify update triggers before the actual Lucene deletion.
                // NOTE(review): keys are lower-cased and matched against the reduce key field —
                // presumably mirroring how entries are written; confirm against the write path.
                keys.Apply(
                    key =>
                    InvokeOnIndexEntryDeletedOnAllBatchers(batchers, new Term(Constants.ReduceKeyFieldName, key.ToLowerInvariant())));

                writer.DeleteDocuments(keys.Select(k => new Term(Constants.ReduceKeyFieldName, k.ToLowerInvariant())).ToArray());
                // Trigger disposal failures are logged and surfaced as index errors, never thrown.
                batchers.ApplyAndIgnoreAllErrors(
                    e =>
                    {
                        logIndexing.WarnException("Failed to dispose on index update trigger in " + PublicName, e);
                        context.AddError(indexId, PublicName, null, e, "Dispose Trigger");
                    },
                    batcher => batcher.Dispose());

                return new IndexedItemsInfo(null)
                {
                    ChangedDocs = keys.Length
                };
            });
        }
560
561        public class ReduceDocuments
562        {
            private readonly MapReduceIndex parent;
            private readonly int inputCount;
            private readonly int indexId;
            private readonly AnonymousObjectToLuceneDocumentConverter anonymousObjectToLuceneDocumentConverter;
            // Single Lucene document instance; presumably reused across reduce outputs — confirm in the (unseen) write path.
            private readonly Document luceneDoc = new Document();

            // Pre-built fields with placeholder values; the reduce value is stored, the reduce key is indexed un-analyzed.
            private readonly Field reduceValueField = new Field(Constants.ReduceValueFieldName, "dummy", Field.Store.YES, Field.Index.NO);
            private readonly Field reduceKeyField = new Field(Constants.ReduceKeyFieldName, "dummy", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);

            // Per-type property accessors for converting POCO reduce results to Lucene fields.
            private readonly ConcurrentDictionary<Type, PropertyAccessor> propertyAccessorCache = new ConcurrentDictionary<Type, PropertyAccessor>();
            // Populated only for level-2 reduces (see constructor); null otherwise.
            private readonly List<AbstractIndexUpdateTriggerBatcher> batchers;
574
            /// <summary>
            /// Captures everything needed to run one reduce operation: the mapped results
            /// grouped by bucket, the reduce level, the reduce keys involved, and the
            /// storage accessor to write through.
            /// </summary>
            public ReduceDocuments(MapReduceIndex parent, AbstractViewGenerator viewGenerator, IEnumerable<IGrouping<int, object>> mappedResultsByBucket, int level, WorkContext context, IStorageActionsAccessor actions, HashSet<string> reduceKeys, int inputCount)
            {
                this.parent = parent;
                this.inputCount = inputCount;
                indexId = this.parent.indexId;
                ViewGenerator = viewGenerator;
                MappedResultsByBucket = mappedResultsByBucket;
                Level = level;
                Context = context;
                Actions = actions;
                ReduceKeys = reduceKeys;

                anonymousObjectToLuceneDocumentConverter = new AnonymousObjectToLuceneDocumentConverter(this.parent.context.Database, this.parent.indexDefinition, ViewGenerator, logIndexing);

                // Index update triggers are only created for the final reduce level (2).
                if (Level == 2)
                {
                    batchers = Context.IndexUpdateTriggers.Select(x => x.CreateBatcher(indexId))
                                .Where(x => x != null)
                                .ToList();
                }
            }
596
            // Inputs captured by the constructor; read-only to the outside.
            public AbstractViewGenerator ViewGenerator { get; private set; }
            public IEnumerable<IGrouping<int, object>> MappedResultsByBucket { get; private set; }
            public int Level { get; private set; }
            public WorkContext Context { get; private set; }
            public IStorageActionsAccessor Actions { get; private set; }
            public HashSet<string> ReduceKeys { get; private set; }
603
604            private string ExtractReduceKey(AbstractViewGenerator viewGenerator, object doc)
605            {
606                try
607                {
608                    object reduceKey = viewGenerator.GroupByExtraction(doc);
609                    if (reduceKey == null)
610                        throw new InvalidOperationException("Could not find reduce key for " + parent.PublicName + " in the result: " + doc);
611
612                    return ReduceKeyToString(reduceKey);
613                }
614                catch (Exception e)
615                {
616                    throw new InvalidOperationException("Could not extract reduce key from reduce result!", e);
617                }
618            }
619
            /// <summary>
            /// Converts a single reduce result into Lucene fields, unwrapping a BoostedValue
            /// and reporting its boost through the out parameter (1 when not boosted).
            /// </summary>
            private IEnumerable<AbstractField> GetFields(object doc, out float boost)
            {
                boost = 1;
                var boostedValue = doc as BoostedValue;
                if (boostedValue != null)
                {
                    doc = boostedValue.Value;
                    boost = boostedValue.Boost;
                }

                IEnumerable<AbstractField> fields = null;
                try
                {
                    var dynamicJsonObject = doc as IDynamicJsonObject;
                    if (dynamicJsonObject != null)
                    {
                        fields = anonymousObjectToLuceneDocumentConverter.Index(dynamicJsonObject.Inner, Field.Store.NO);
                    }
                    else
                    {
                        // POCO results go through a cached per-type property accessor.
                        var properties = propertyAccessorCache.GetOrAdd(doc.GetType(), PropertyAccessor.Create);
                        fields = anonymousObjectToLuceneDocumentConverter.Index(doc, properties, Field.Store.NO);
                    }
                }
                catch (InvalidShapeException)
                {
                    // Invalid spatial shapes are deliberately swallowed here.
                    // NOTE(review): when this fires, 'fields' stays null — the method then
                    // returns null (or throws in the Select below when boost != 1); confirm
                    // callers tolerate a null result.
                }

                if (Math.Abs(boost - 1) > float.Epsilon)
                {
                    // Norms must be re-enabled for the boost to have any effect at query time.
                    return fields.Select(x => { x.OmitNorms = false; return x; });
                }
                return fields;
            }
654
655            private static RavenJObject ToJsonDocument(object doc)
656            {
657                var boostedValue = doc as BoostedValue;
658                if (boostedValue != null)
659                {
660                    doc = boostedValue.Value;
661                }
662                var dynamicJsonObject = doc as IDynamicJsonObject;
663                if (dynamicJsonObject != null)
664                {
665                    return dynamicJsonObject.Inner;
666                }
667                var ravenJObject = doc as RavenJObject;
668                if (ravenJObject != null)
669                    return ravenJObject;
670                var jsonDocument = RavenJObject.FromObject(doc);
671                MergeArrays(jsonDocument);
672
673                // remove _, __, etc fields
674                foreach (var prop in jsonDocument.Where(x => x.Key.All(ch => ch == '_')).ToArray())
675                {
676                    jsonDocument.Remove(prop.Key);
677                }
678                return jsonDocument;
679            }
680
681            private static void MergeArrays(RavenJToken token)
682            {
683                if (token == null)
684                    return;
685                switch (token.Type)
686                {
687                    case JTokenType.Array:
688                        var arr = (RavenJArray)token;
689                        for (int i = 0; i < arr.Length; i++)
690                        {
691                            var current = arr[i];
692                            if (current == null || current.Type != JTokenType.Array)
693                                continue;
694                            arr.RemoveAt(i);
695                            i--;
696                            var j = Math.Max(0, i);
697                            foreach (var item in (RavenJArray)current)
698                            {
699                                arr.Insert(j++, item);
700                            }
701                        }
702                        break;
703                    case JTokenType.Object:
704                        foreach (var kvp in ((RavenJObject)token))
705                        {
706                            MergeArrays(kvp.Value);
707                        }
708                        break;
709                }
710            }
711
            /// <summary>
            /// Executes one reduction batch over the mapped results, bucket by bucket, at the
            /// current <see cref="Level"/>. Levels 0 and 1 persist the reduced output into the
            /// next level's storage; level 2 replaces the affected reduce keys' documents in
            /// the Lucene index. Returns the performance stats recorded for the batch.
            /// </summary>
            public IndexingPerformanceStats ExecuteReduction()
            {
                var count = 0;       // reduce outputs produced
                var sourceCount = 0; // mapped results consumed
                var addDocumentDuration = new Stopwatch();
                var convertToLuceneDocumentDuration = new Stopwatch();
                var linqExecutionDuration = new Stopwatch();
                var deleteExistingDocumentsDuration = new Stopwatch();
                var writeToIndexStats = new List<PerformanceStats>();

                IndexingPerformanceStats performance = null;

                // The counters and stopwatches above are captured and mutated by this lambda;
                // parent.Write supplies the index writer, analyzer and stats accumulator.
                parent.Write((indexWriter, analyzer, stats) =>
                {
                    stats.Operation = IndexingWorkStats.Status.Reduce;

                    try
                    {                                                
                        if (Level == 2)
                        {
                            // Final level: delete-then-rewrite the documents for every reduce
                            // key touched by this batch.
                            RemoveExistingReduceKeysFromIndex(indexWriter, deleteExistingDocumentsDuration);
                        }
                        
                        foreach (var mappedResults in MappedResultsByBucket)
                        {
                            // Wrap the bucket's items so each one is counted as it streams
                            // through the reduce function.
                            var input = mappedResults.Select(x =>
                            {
                                sourceCount++;
                                return x;
                            });

                            IndexingFunc reduceDefinition = ViewGenerator.ReduceDefinition;
                            foreach (var doc in parent.RobustEnumerationReduce(input.GetEnumerator(), reduceDefinition, stats, linqExecutionDuration))
                            {
                                count++;

                                switch (Level)
                                {
                                    case 0:
                                    case 1:
                                        // Intermediate level: store the result for the next level.
                                        // NOTE(review): the two bucket arguments look like
                                        // (bucket, nextLevelBucket) with 1024 buckets collapsing
                                        // into one - confirm against PutReducedResult's signature.
                                        string reduceKeyAsString = ExtractReduceKey(ViewGenerator, doc);
                                        Actions.MapReduce.PutReducedResult(indexId, reduceKeyAsString, Level + 1, mappedResults.Key, mappedResults.Key / 1024, ToJsonDocument(doc));
                                        Actions.General.MaybePulseTransaction();
                                        break;
                                    case 2:
                                        WriteDocumentToIndex(doc, indexWriter, analyzer, convertToLuceneDocumentDuration, addDocumentDuration);
                                        break;
                                    default:
                                        throw new InvalidOperationException("Unknown level: " + Level);
                                }

                                stats.ReduceSuccesses++;
                            }
                        }                        
                    }
                    catch (Exception e)
                    {
                        if (Level == 2)
                        {
                            // Give the update-trigger batchers a chance to react to the
                            // failure; their own errors are logged and ignored.
                            batchers.ApplyAndIgnoreAllErrors(
                                ex =>
                                {
                                    logIndexing.WarnException("Failed to notify index update trigger batcher about an error in " + parent.PublicName, ex);
                                    Context.AddError(indexId, parent.indexDefinition.Name, null, ex, "AnErrorOccured Trigger");
                                },
                                x => x.AnErrorOccured(e));
                        }
                        throw;
                    }
                    finally
                    {
                        if (Level == 2)
                        {
                            // Always dispose the batchers, even when the batch failed.
                            batchers.ApplyAndIgnoreAllErrors(
                                e =>
                                {
                                    logIndexing.WarnException("Failed to dispose on index update trigger in " + parent.PublicName, e);
                                    Context.AddError(indexId, parent.indexDefinition.Name, null, e, "Dispose Trigger");
                                },
                                x => x.Dispose());
                        }

                        // TODO: Check if we need to report "Bucket Counts" or "Total Input Elements"?
                        performance = parent.RecordCurrentBatch("Current Reduce #" + Level, "Reduce Level " + Level, sourceCount);
                    }

                    return new IndexedItemsInfo(null)
                    {
                        ChangedDocs = count + ReduceKeys.Count
                    };
                }, writeToIndexStats);

                // Aggregate the per-phase timings recorded inside the write callback.
                var performanceStats = new List<BasePerformanceStats>();

                performanceStats.Add(PerformanceStats.From(IndexingOperation.Linq_ReduceLinqExecution, linqExecutionDuration.ElapsedMilliseconds));
                performanceStats.Add(PerformanceStats.From(IndexingOperation.Lucene_DeleteExistingDocument, deleteExistingDocumentsDuration.ElapsedMilliseconds));
                performanceStats.Add(PerformanceStats.From(IndexingOperation.Lucene_ConvertToLuceneDocument, convertToLuceneDocumentDuration.ElapsedMilliseconds));
                performanceStats.Add(PerformanceStats.From(IndexingOperation.Lucene_AddDocument, addDocumentDuration.ElapsedMilliseconds));
                performanceStats.AddRange(writeToIndexStats);

                parent.BatchCompleted("Current Reduce #" + Level, "Reduce Level " + Level, sourceCount, count, performanceStats);
                if (logIndexing.IsDebugEnabled)
                logIndexing.Debug(() => string.Format("Reduce resulted in {0} entries for {1} for reduce keys at level {3}: {2}", count, parent.PublicName, string.Join(", ", ReduceKeys), Level));

                return performance;
            }
818
819            private void WriteDocumentToIndex(object doc, RavenIndexWriter indexWriter, Analyzer analyzer, Stopwatch convertToLuceneDocumentDuration, Stopwatch addDocumentDutation)
820            {
821                string reduceKeyAsString;
822                using (StopwatchScope.For(convertToLuceneDocumentDuration))
823                {
824                    float boost;
825                    try
826                    {
827                        var fields = GetFields(doc, out boost);
828
829                        reduceKeyAsString = ExtractReduceKey(ViewGenerator, doc);
830                        reduceKeyField.SetValue(reduceKeyAsString);
831                        reduceValueField.SetValue(ToJsonDocument(doc).ToString(Formatting.None));
832
833                        luceneDoc.GetFields().Clear();
834                        luceneDoc.Boost = boost;
835                        luceneDoc.Add(reduceKeyField);
836                        luceneDoc.Add(reduceValueField);
837
838                        foreach (var field in fields)
839                            luceneDoc.Add(field);
840                    }
841                    catch (Exception e)
842                    {
843                        Context.AddError(indexId,
844                            parent.PublicName,
845                            TryGetDocKey(doc),
846                            e,
847                            "Reduce"
848                            );
849                        logIndexing.WarnException("Could not get fields to during reduce for " + parent.PublicName, e);
850                        return;
851                    }
852                }
853                batchers.ApplyAndIgnoreAllErrors(
854                    exception =>
855                    {
856                        logIndexing.WarnException(
857                            string.Format("Error when executed OnIndexEntryCreated trigger for index '{0}', key: '{1}'",
858                                          parent.PublicName, reduceKeyAsString),
859                            exception);
860                        Context.AddError(indexId, parent.PublicName, reduceKeyAsString, exception, "OnIndexEntryCreated Trigger");
861                    },
862                    trigger => trigger.OnIndexEntryCreated(reduceKeyAsString, luceneDoc));
863
864                parent.LogIndexedDocument(reduceKeyAsString, luceneDoc);
865
866                using (StopwatchScope.For(addDocumentDutation))
867                {
868                    parent.AddDocumentToIndex(indexWriter, luceneDoc, analyzer);
869                }
870            }
871
872            private void RemoveExistingReduceKeysFromIndex(RavenIndexWriter indexWriter, Stopwatch deleteExistingDocumentsDuration)
873            {
874                foreach (var reduceKey in ReduceKeys)
875                {
876                    var entryKey = reduceKey;
877                    parent.InvokeOnIndexEntryDeletedOnAllBatchers(batchers, new Term(Constants.ReduceKeyFieldName, entryKey));
878
879                    using (StopwatchScope.For(deleteExistingDocumentsDuration))
880                    {
881                        indexWriter.DeleteDocuments(new Term(Constants.ReduceKeyFieldName, entryKey));
882                    }
883                }
884            }
885        }
886    }
887}