
/ToMigrate/Raven.Database/Indexing/MapReduceIndex.cs

http://github.com/ayende/ravendb
C# | 887 lines | 743 code | 128 blank | 16 comment | 77 complexity | b6d38a3b52086945c3e703a8d2d97936 MD5
Possible License(s): GPL-3.0, MPL-2.0-no-copyleft-exception, LGPL-2.1, Apache-2.0, BSD-3-Clause, CC-BY-SA-3.0

//-----------------------------------------------------------------------
// <copyright file="MapReduceIndex.cs" company="Hibernating Rhinos LTD">
// Copyright (c) Hibernating Rhinos LTD. All rights reserved.
// </copyright>
//-----------------------------------------------------------------------
using System;
using System.Collections;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Threading;
using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Raven.Abstractions.Logging;
using Raven.Database.Extensions;
using Raven.Database.Plugins;
using Raven.Database.Util;
using Raven.Imports.Newtonsoft.Json;
using Raven.Abstractions;
using Raven.Abstractions.Extensions;
using Raven.Abstractions.Data;
using Raven.Abstractions.Indexing;
using Raven.Abstractions.Linq;
using Raven.Database.Data;
using Raven.Database.Linq;
using Raven.Database.Storage;
using Raven.Imports.Newtonsoft.Json.Linq;
using Raven.Json.Linq;
using Spatial4n.Core.Exceptions;
using Sparrow.Collections;

namespace Raven.Database.Indexing
{
    internal class MapReduceIndex : Index
    {
        readonly JsonSerializer jsonSerializer;
        private static readonly JsonConverterCollection MapReduceConverters;

        static MapReduceIndex()
        {
            MapReduceConverters = new JsonConverterCollection(Default.Converters)
            {
                new IgnoreFieldable()
            };
            MapReduceConverters.Freeze();
        }
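
        // Serializer hook: if a map result happens to contain Lucene field objects,
        // this converter writes a placeholder string instead of trying to serialize them.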
        private class IgnoreFieldable : JsonConverter
        {
            public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
            {
                writer.WriteValue("IgnoredLuceneField");
            }

            public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
            {
                return null;
            }

            public override bool CanConvert(Type objectType)
            {
                return typeof(IFieldable).IsAssignableFrom(objectType) ||
                       typeof(IEnumerable<AbstractField>).IsAssignableFrom(objectType);
            }
        }

        public MapReduceIndex(Directory directory, int id, IndexDefinition indexDefinition,
                              AbstractViewGenerator viewGenerator, WorkContext context)
            : base(directory, id, indexDefinition, viewGenerator, context)
        {
            jsonSerializer = JsonExtensions.CreateDefaultJsonSerializer();
            jsonSerializer.Converters = MapReduceConverters;
        }

        public override bool IsMapReduce
        {
            get { return true; }
        }
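
        // Map phase entry point: deletes the previous mapped results for each incoming
        // document, runs the map (and map-side reduce) over parallel partitions, reconciles
        // the per-reduce-key counters, and schedules the changed buckets for reduction.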
        public override IndexingPerformanceStats IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp, CancellationToken token)
        {
            token.ThrowIfCancellationRequested();

            var count = 0;
            var sourceCount = 0;
            var deleted = new Dictionary<ReduceKeyAndBucket, int>();
            var performance = RecordCurrentBatch("Current Map", "Map", batch.Docs.Count);
            var performanceStats = new List<BasePerformanceStats>();
            var usedStorageAccessors = new ConcurrentSet<IStorageActionsAccessor>();

            if (usedStorageAccessors.TryAdd(actions))
            {
                var storageCommitDuration = new Stopwatch();
                actions.BeforeStorageCommit += storageCommitDuration.Start;
                actions.AfterStorageCommit += () =>
                {
                    storageCommitDuration.Stop();
                    performanceStats.Add(PerformanceStats.From(IndexingOperation.StorageCommit, storageCommitDuration.ElapsedMilliseconds));
                };
            }

            var deleteMappedResultsDuration = new Stopwatch();
            var documentsWrapped = batch.Docs.Select(doc =>
            {
                token.ThrowIfCancellationRequested();

                sourceCount++;
                var documentId = doc.__document_id;

                using (StopwatchScope.For(deleteMappedResultsDuration))
                {
                    actions.MapReduce.DeleteMappedResultsForDocumentId((string)documentId, indexId, deleted);
                }

                return doc;
            })
            .Where(x => x is FilteredDocument == false)
            .ToList();

            performanceStats.Add(new PerformanceStats
            {
                Name = IndexingOperation.Map_DeleteMappedResults,
                DurationMs = deleteMappedResultsDuration.ElapsedMilliseconds,
            });

            var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
            var allReferenceEtags = new ConcurrentQueue<IDictionary<string, Etag>>();
            var allState = new ConcurrentQueue<Tuple<HashSet<ReduceKeyAndBucket>, IndexingWorkStats, Dictionary<string, int>>>();
            var parallelOperations = new ConcurrentQueue<ParallelBatchStats>();
            var parallelProcessingStart = SystemTime.UtcNow;

            context.Database.MappingThreadPool.ExecuteBatch(documentsWrapped, (IEnumerator<dynamic> partition) =>
            {
                token.ThrowIfCancellationRequested();

                var parallelStats = new ParallelBatchStats
                {
                    StartDelay = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds
                };

                var localStats = new IndexingWorkStats();
                var localChanges = new HashSet<ReduceKeyAndBucket>();
                var statsPerKey = new Dictionary<string, int>();

                var linqExecutionDuration = new Stopwatch();
                var reduceInMapLinqExecutionDuration = new Stopwatch();
                var putMappedResultsDuration = new Stopwatch();
                var convertToRavenJObjectDuration = new Stopwatch();

                allState.Enqueue(Tuple.Create(localChanges, localStats, statsPerKey));

                using (CurrentIndexingScope.Current = new CurrentIndexingScope(context.Database, PublicName))
                {
                    // we are writing to the transactional store from multiple threads here, and in a streaming fashion
                    // should result in less memory and better perf
                    context.TransactionalStorage.Batch(accessor =>
                    {
                        if (usedStorageAccessors.TryAdd(accessor))
                        {
                            var storageCommitDuration = new Stopwatch();
                            accessor.BeforeStorageCommit += storageCommitDuration.Start;
                            accessor.AfterStorageCommit += () =>
                            {
                                storageCommitDuration.Stop();
                                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.StorageCommit, storageCommitDuration.ElapsedMilliseconds));
                            };
                        }

                        var mapResults = RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, localStats, linqExecutionDuration);
                        var currentDocumentResults = new List<object>();
                        string currentKey = null;
                        bool skipDocument = false;

                        foreach (var currentDoc in mapResults)
                        {
                            token.ThrowIfCancellationRequested();

                            var documentId = GetDocumentId(currentDoc);
                            if (documentId != currentKey)
                            {
                                count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey, reduceInMapLinqExecutionDuration, putMappedResultsDuration, convertToRavenJObjectDuration);

                                currentDocumentResults.Clear();
                                currentKey = documentId;
                            }
                            else if (skipDocument)
                            {
                                continue;
                            }

                            RavenJObject currentDocJObject;
                            using (StopwatchScope.For(convertToRavenJObjectDuration))
                            {
                                currentDocJObject = RavenJObject.FromObject(currentDoc, jsonSerializer);
                            }

                            currentDocumentResults.Add(new DynamicJsonObject(currentDocJObject));

                            if (EnsureValidNumberOfOutputsForDocument(documentId, currentDocumentResults.Count) == false)
                            {
                                skipDocument = true;
                                currentDocumentResults.Clear();
                                continue;
                            }

                            Interlocked.Increment(ref localStats.IndexingSuccesses);
                        }

                        count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey, reduceInMapLinqExecutionDuration, putMappedResultsDuration, convertToRavenJObjectDuration);

                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.LoadDocument, CurrentIndexingScope.Current.LoadDocumentDuration.ElapsedMilliseconds));
                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Linq_MapExecution, linqExecutionDuration.ElapsedMilliseconds));
                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Linq_ReduceLinqExecution, reduceInMapLinqExecutionDuration.ElapsedMilliseconds));
                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_PutMappedResults, putMappedResultsDuration.ElapsedMilliseconds));
                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_ConvertToRavenJObject, convertToRavenJObjectDuration.ElapsedMilliseconds));

                        parallelOperations.Enqueue(parallelStats);
                    });

                    allReferenceEtags.Enqueue(CurrentIndexingScope.Current.ReferencesEtags);
                    allReferencedDocs.Enqueue(CurrentIndexingScope.Current.ReferencedDocuments);
                }
            }, description: string.Format("Mapping index {0} up to Etag {1}, for {2} documents", this.PublicName, batch.HighestEtagBeforeFiltering, documentsWrapped.Count));

            performanceStats.Add(new ParallelPerformanceStats
            {
                NumberOfThreads = parallelOperations.Count,
                DurationMs = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds,
                BatchedOperations = parallelOperations.ToList()
            });

            var updateDocumentReferencesDuration = new Stopwatch();
            using (StopwatchScope.For(updateDocumentReferencesDuration))
            {
                UpdateDocumentReferences(actions, allReferencedDocs, allReferenceEtags);
            }
            performanceStats.Add(PerformanceStats.From(IndexingOperation.UpdateDocumentReferences, updateDocumentReferencesDuration.ElapsedMilliseconds));

            var changed = allState.SelectMany(x => x.Item1).Concat(deleted.Keys)
                .Distinct()
                .ToList();

            var stats = new IndexingWorkStats(allState.Select(x => x.Item2));
            var reduceKeyStats = allState.SelectMany(x => x.Item3)
                .GroupBy(x => x.Key)
                .Select(g => new { g.Key, Count = g.Sum(x => x.Value) })
                .ToList();

            var reduceKeyToCount = new ConcurrentDictionary<string, int>();
            foreach (var singleDeleted in deleted)
            {
                var reduceKey = singleDeleted.Key.ReduceKey;
                reduceKeyToCount[reduceKey] = reduceKeyToCount.GetOrDefault(reduceKey) + singleDeleted.Value;
            }

            context.Database.MappingThreadPool.ExecuteBatch(reduceKeyStats, enumerator => context.TransactionalStorage.Batch(accessor =>
            {
                while (enumerator.MoveNext())
                {
                    var reduceKeyStat = enumerator.Current;
                    var value = 0;
                    reduceKeyToCount.TryRemove(reduceKeyStat.Key, out value);

                    var changeValue = reduceKeyStat.Count - value;
                    if (changeValue == 0)
                    {
                        // nothing to change
                        continue;
                    }

                    accessor.MapReduce.ChangeReduceKeyCounterValue(indexId, reduceKeyStat.Key, changeValue);
                }
            }), description: string.Format("Incrementing reduce key counter for index {0} for operation from Etag {1} to Etag {2}", this.PublicName, this.GetLastEtagFromStats(), batch.HighestEtagBeforeFiltering));

            foreach (var keyValuePair in reduceKeyToCount)
            {
                // those are the reduce keys that were replaced
                actions.MapReduce.ChangeReduceKeyCounterValue(indexId, keyValuePair.Key, -keyValuePair.Value);
            }

            actions.General.MaybePulseTransaction();

            var parallelReductionOperations = new ConcurrentQueue<ParallelBatchStats>();
            var parallelReductionStart = SystemTime.UtcNow;

            context.Database.MappingThreadPool.ExecuteBatch(changed, enumerator => context.TransactionalStorage.Batch(accessor =>
            {
                var parallelStats = new ParallelBatchStats
                {
                    StartDelay = (long)(SystemTime.UtcNow - parallelReductionStart).TotalMilliseconds
                };

                var scheduleReductionsDuration = new Stopwatch();
                using (StopwatchScope.For(scheduleReductionsDuration))
                {
                    while (enumerator.MoveNext())
                    {
                        accessor.MapReduce.ScheduleReductions(indexId, 0, enumerator.Current);
                        accessor.General.MaybePulseTransaction();
                    }
                }

                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_ScheduleReductions, scheduleReductionsDuration.ElapsedMilliseconds));
                parallelReductionOperations.Enqueue(parallelStats);
            }), description: string.Format("Map Scheduling Reductions for index {0} after operation from Etag {1} to Etag {2}", this.PublicName, this.GetLastEtagFromStats(), batch.HighestEtagBeforeFiltering));

            performanceStats.Add(new ParallelPerformanceStats
            {
                NumberOfThreads = parallelReductionOperations.Count,
                DurationMs = (long)(SystemTime.UtcNow - parallelReductionStart).TotalMilliseconds,
                BatchedOperations = parallelReductionOperations.ToList()
            });

            UpdateIndexingStats(context, stats);

            performance.OnCompleted = () => BatchCompleted("Current Map", "Map", sourceCount, count, performanceStats);

            if (logIndexing.IsDebugEnabled)
                logIndexing.Debug("Mapped {0} documents for {1}", count, PublicName);

            return performance;
        }
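
        // Runs the map-side reduce over the map results of a single source document,
        // persists each reduced entry as a mapped result, and records which
        // (bucket, reduce key) pairs changed so they can be scheduled for reduction.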
        private int ProcessBatch(AbstractViewGenerator viewGenerator, List<object> currentDocumentResults, string currentKey, HashSet<ReduceKeyAndBucket> changes,
            IStorageActionsAccessor actions,
            IDictionary<string, int> statsPerKey, Stopwatch reduceDuringMapLinqExecution, Stopwatch putMappedResultsDuration, Stopwatch convertToRavenJObjectDuration)
        {
            if (currentKey == null || currentDocumentResults.Count == 0)
            {
                return 0;
            }

            var old = CurrentIndexingScope.Current;
            try
            {
                CurrentIndexingScope.Current = null;

                if (logIndexing.IsDebugEnabled)
                {
                    var sb = new StringBuilder()
                        .AppendFormat("Index {0} for document {1} resulted in:", PublicName, currentKey)
                        .AppendLine();
                    foreach (var currentDocumentResult in currentDocumentResults)
                    {
                        sb.AppendLine(JsonConvert.SerializeObject(currentDocumentResult));
                    }
                    logIndexing.Debug(sb.ToString());
                }

                int count = 0;

                var results = RobustEnumerationReduceDuringMapPhase(currentDocumentResults.GetEnumerator(), viewGenerator.ReduceDefinition, reduceDuringMapLinqExecution);
                foreach (var doc in results)
                {
                    count++;

                    var reduceValue = viewGenerator.GroupByExtraction(doc);
                    if (reduceValue == null)
                    {
                        if (logIndexing.IsDebugEnabled)
                            logIndexing.Debug("Field {0} is used as the reduce key and cannot be null, skipping document {1}",
                                viewGenerator.GroupByExtraction, currentKey);
                        continue;
                    }

                    string reduceKey = ReduceKeyToString(reduceValue);

                    RavenJObject data;
                    using (StopwatchScope.For(convertToRavenJObjectDuration))
                    {
                        data = GetMappedData(doc);
                    }

                    if (logIndexing.IsDebugEnabled)
                    {
                        logIndexing.Debug("Index {0} for document {1} resulted in ({2}): {3}", PublicName, currentKey, reduceKey, data);
                    }

                    using (StopwatchScope.For(putMappedResultsDuration))
                    {
                        actions.MapReduce.PutMappedResult(indexId, currentKey, reduceKey, data);
                    }

                    statsPerKey[reduceKey] = statsPerKey.GetOrDefault(reduceKey) + 1;
                    actions.General.MaybePulseTransaction();
                    changes.Add(new ReduceKeyAndBucket(IndexingUtil.MapBucket(currentKey), reduceKey));
                }
                return count;
            }
            finally
            {
                CurrentIndexingScope.Current = old;
            }
        }
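
        // Converts a single reduce-during-map output into the RavenJObject that is
        // persisted as the mapped result.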
        private RavenJObject GetMappedData(object doc)
        {
            if (doc is IDynamicJsonObject)
                return ((IDynamicJsonObject)doc).Inner;

            var ravenJTokenWriter = new RavenJTokenWriter();
            jsonSerializer.Serialize(ravenJTokenWriter, doc);
            return (RavenJObject)ravenJTokenWriter.Token;
        }

        private static readonly ConcurrentDictionary<Type, Func<object, object>> documentIdFetcherCache =
            new ConcurrentDictionary<Type, Func<object, object>>();
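
        // Extracts __document_id from a map result, caching an id-fetcher delegate per CLR type.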
        private static string GetDocumentId(object doc)
        {
            var docIdFetcher = documentIdFetcherCache.GetOrAdd(doc.GetType(), type =>
            {
                // document may be DynamicJsonObject if we are using compiled views
                if (typeof(DynamicJsonObject) == type)
                {
                    return i => ((dynamic)i).__document_id;
                }
                var docIdProp = TypeDescriptor.GetProperties(doc).Find(Constants.DocumentIdFieldName, false);
                return docIdProp.GetValue;
            });
            if (docIdFetcher == null)
                throw new InvalidOperationException("Could not create document id fetcher for this document");
            var documentId = docIdFetcher(doc);
            if (documentId == null || documentId is DynamicNullObject)
                throw new InvalidOperationException("Could not get document id for this document");
            return (string)documentId;
        }
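
        // Normalizes a reduce key value (string, date, value type, or JSON object) to its
        // canonical string form, so equal keys always compare equal regardless of type.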
        internal static string ReduceKeyToString(object reduceValue)
        {
            var reduceValueAsString = reduceValue as string;
            if (reduceValueAsString != null)
                return reduceValueAsString;

            if (reduceValue is DateTime)
                return ((DateTime)reduceValue).GetDefaultRavenFormat();
            if (reduceValue is DateTimeOffset)
                return ((DateTimeOffset)reduceValue).ToString(Default.DateTimeFormatsToWrite, CultureInfo.InvariantCulture);
            if (reduceValue is ValueType)
                return reduceValue.ToString();

            var dynamicJsonObject = reduceValue as IDynamicJsonObject;
            if (dynamicJsonObject != null)
                return dynamicJsonObject.Inner.ToString(Formatting.None);

            return RavenJToken.FromObject(reduceValue).ToString(Formatting.None);
        }
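
        // Query-time projection: the reduced entry is stored as JSON in the reduce value
        // field, so the projection is parsed from there rather than loaded from a document.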
        protected override IndexQueryResult RetrieveDocument(Document document, FieldsToFetch fieldsToFetch, ScoreDoc score)
        {
            fieldsToFetch.EnsureHasField(Constants.ReduceKeyFieldName);
            if (fieldsToFetch.HasExplicitFieldsToFetch)
            {
                return base.RetrieveDocument(document, fieldsToFetch, score);
            }

            var field = document.GetField(Constants.ReduceValueFieldName);
            if (field == null)
            {
                fieldsToFetch = fieldsToFetch.CloneWith(document.GetFields().Select(x => x.Name).ToArray());
                return base.RetrieveDocument(document, fieldsToFetch, score);
            }

            var projection = RavenJObject.Parse(field.StringValue);
            if (fieldsToFetch.FetchAllStoredFields)
            {
                var fields = new HashSet<string>(document.GetFields().Select(x => x.Name));
                fields.Remove(Constants.ReduceKeyFieldName);
                var documentFromFields = new RavenJObject();
                AddFieldsToDocument(document, fields, documentFromFields);
                foreach (var kvp in projection)
                {
                    documentFromFields[kvp.Key] = kvp.Value;
                }
                projection = documentFromFields;
            }

            return new IndexQueryResult
            {
                Projection = projection,
                Score = score.Score,
                ReduceVal = field.StringValue
            };
        }

        protected override void HandleCommitPoints(IndexedItemsInfo itemsInfo, IndexSegmentsInfo segmentsInfo)
        {
            // MapReduce index does not store and use any commit points
        }

        protected override bool IsUpToDateEnoughToWriteToDisk(Etag highestETag)
        {
            // for map/reduce indexes, we always write to disk, the in memory optimization
            // isn't really doing much for us, since we already write the intermediate results
            // to disk anyway, so it doesn't matter
            return true;
        }
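
        // Document deletion: removes the mapped results for each key, reschedules the
        // affected buckets so their reductions are recomputed, and deletes Lucene entries
        // whose reduce key term matches the lower-cased document key.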
        public override void Remove(string[] keys, WorkContext context)
        {
            context.TransactionalStorage.Batch(actions =>
            {
                var reduceKeyAndBuckets = new Dictionary<ReduceKeyAndBucket, int>();
                foreach (var key in keys)
                {
                    context.CancellationToken.ThrowIfCancellationRequested();
                    actions.MapReduce.DeleteMappedResultsForDocumentId(key, indexId, reduceKeyAndBuckets);
                }

                actions.MapReduce.UpdateRemovedMapReduceStats(indexId, reduceKeyAndBuckets, context.CancellationToken);
                foreach (var reduceKeyAndBucket in reduceKeyAndBuckets)
                {
                    context.CancellationToken.ThrowIfCancellationRequested();
                    actions.MapReduce.ScheduleReductions(indexId, 0, reduceKeyAndBucket.Key);
                }
            });

            Write((writer, analyzer, stats) =>
            {
                stats.Operation = IndexingWorkStats.Status.Ignore;
                if (logIndexing.IsDebugEnabled)
                    logIndexing.Debug(() => string.Format("Deleting ({0}) from {1}", string.Join(", ", keys), PublicName));

                var batchers = context.IndexUpdateTriggers.Select(x => x.CreateBatcher(indexId))
                    .Where(x => x != null)
                    .ToList();

                keys.Apply(
                    key =>
                    InvokeOnIndexEntryDeletedOnAllBatchers(batchers, new Term(Constants.ReduceKeyFieldName, key.ToLowerInvariant())));

                writer.DeleteDocuments(keys.Select(k => new Term(Constants.ReduceKeyFieldName, k.ToLowerInvariant())).ToArray());

                batchers.ApplyAndIgnoreAllErrors(
                    e =>
                    {
                        logIndexing.WarnException("Failed to dispose on index update trigger in " + PublicName, e);
                        context.AddError(indexId, PublicName, null, e, "Dispose Trigger");
                    },
                    batcher => batcher.Dispose());

                return new IndexedItemsInfo(null)
                {
                    ChangedDocs = keys.Length
                };
            });
        }
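
        // Performs one reduction pass. Levels 0 and 1 persist intermediate reduced results
        // back to storage for re-reduction; level 2 is the final pass that writes the
        // reduced entries into the Lucene index.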
        public class ReduceDocuments
        {
            private readonly MapReduceIndex parent;
            private readonly int inputCount;
            private readonly int indexId;
            private readonly AnonymousObjectToLuceneDocumentConverter anonymousObjectToLuceneDocumentConverter;
            private readonly Document luceneDoc = new Document();
            private readonly Field reduceValueField = new Field(Constants.ReduceValueFieldName, "dummy", Field.Store.YES, Field.Index.NO);
            private readonly Field reduceKeyField = new Field(Constants.ReduceKeyFieldName, "dummy", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
            private readonly ConcurrentDictionary<Type, PropertyAccessor> propertyAccessorCache = new ConcurrentDictionary<Type, PropertyAccessor>();
            private readonly List<AbstractIndexUpdateTriggerBatcher> batchers;

            public ReduceDocuments(MapReduceIndex parent, AbstractViewGenerator viewGenerator, IEnumerable<IGrouping<int, object>> mappedResultsByBucket, int level, WorkContext context, IStorageActionsAccessor actions, HashSet<string> reduceKeys, int inputCount)
            {
                this.parent = parent;
                this.inputCount = inputCount;
                indexId = this.parent.indexId;
                ViewGenerator = viewGenerator;
                MappedResultsByBucket = mappedResultsByBucket;
                Level = level;
                Context = context;
                Actions = actions;
                ReduceKeys = reduceKeys;

                anonymousObjectToLuceneDocumentConverter = new AnonymousObjectToLuceneDocumentConverter(this.parent.context.Database, this.parent.indexDefinition, ViewGenerator, logIndexing);

                if (Level == 2)
                {
                    batchers = Context.IndexUpdateTriggers.Select(x => x.CreateBatcher(indexId))
                        .Where(x => x != null)
                        .ToList();
                }
            }

            public AbstractViewGenerator ViewGenerator { get; private set; }
            public IEnumerable<IGrouping<int, object>> MappedResultsByBucket { get; private set; }
            public int Level { get; private set; }
            public WorkContext Context { get; private set; }
            public IStorageActionsAccessor Actions { get; private set; }
            public HashSet<string> ReduceKeys { get; private set; }

            private string ExtractReduceKey(AbstractViewGenerator viewGenerator, object doc)
            {
                try
                {
                    object reduceKey = viewGenerator.GroupByExtraction(doc);
                    if (reduceKey == null)
                        throw new InvalidOperationException("Could not find reduce key for " + parent.PublicName + " in the result: " + doc);

                    return ReduceKeyToString(reduceKey);
                }
                catch (Exception e)
                {
                    throw new InvalidOperationException("Could not extract reduce key from reduce result!", e);
                }
            }
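
            // Converts a reduce result into Lucene fields, honoring BoostedValue wrappers
            // and caching property accessors for strongly typed results.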
            private IEnumerable<AbstractField> GetFields(object doc, out float boost)
            {
                boost = 1;

                var boostedValue = doc as BoostedValue;
                if (boostedValue != null)
                {
                    doc = boostedValue.Value;
                    boost = boostedValue.Boost;
                }

                IEnumerable<AbstractField> fields = null;
                try
                {
                    var dynamicJsonObject = doc as IDynamicJsonObject;
                    if (dynamicJsonObject != null)
                    {
                        fields = anonymousObjectToLuceneDocumentConverter.Index(dynamicJsonObject.Inner, Field.Store.NO);
                    }
                    else
                    {
                        var properties = propertyAccessorCache.GetOrAdd(doc.GetType(), PropertyAccessor.Create);
                        fields = anonymousObjectToLuceneDocumentConverter.Index(doc, properties, Field.Store.NO);
                    }
                }
                catch (InvalidShapeException)
                {
                    // an invalid spatial shape is swallowed here, leaving fields null
                }

                if (Math.Abs(boost - 1) > float.Epsilon)
                {
                    return fields.Select(x => { x.OmitNorms = false; return x; });
                }

                return fields;
            }

            private static RavenJObject ToJsonDocument(object doc)
            {
                var boostedValue = doc as BoostedValue;
                if (boostedValue != null)
                {
                    doc = boostedValue.Value;
                }

                var dynamicJsonObject = doc as IDynamicJsonObject;
                if (dynamicJsonObject != null)
                {
                    return dynamicJsonObject.Inner;
                }

                var ravenJObject = doc as RavenJObject;
                if (ravenJObject != null)
                    return ravenJObject;

                var jsonDocument = RavenJObject.FromObject(doc);
                MergeArrays(jsonDocument);

                // remove _, __, etc fields
                foreach (var prop in jsonDocument.Where(x => x.Key.All(ch => ch == '_')).ToArray())
                {
                    jsonDocument.Remove(prop.Key);
                }

                return jsonDocument;
            }
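
            // Flattens nested arrays in place (e.g. [[a, b], c] becomes [a, b, c]),
            // recursing into object properties.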
            private static void MergeArrays(RavenJToken token)
            {
                if (token == null)
                    return;

                switch (token.Type)
                {
                    case JTokenType.Array:
                        var arr = (RavenJArray)token;
                        for (int i = 0; i < arr.Length; i++)
                        {
                            var current = arr[i];
                            if (current == null || current.Type != JTokenType.Array)
                                continue;

                            arr.RemoveAt(i);
                            i--;

                            var j = Math.Max(0, i);
                            foreach (var item in (RavenJArray)current)
                            {
                                arr.Insert(j++, item);
                            }
                        }
                        break;
                    case JTokenType.Object:
                        foreach (var kvp in ((RavenJObject)token))
                        {
                            MergeArrays(kvp.Value);
                        }
                        break;
                }
            }
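
            // Runs the reduce definition over every bucket. At level 2 the existing index
            // entries for the affected reduce keys are deleted first, then each reduced
            // result is written as a fresh Lucene document.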
            public IndexingPerformanceStats ExecuteReduction()
            {
                var count = 0;
                var sourceCount = 0;
                var addDocumentDuration = new Stopwatch();
                var convertToLuceneDocumentDuration = new Stopwatch();
                var linqExecutionDuration = new Stopwatch();
                var deleteExistingDocumentsDuration = new Stopwatch();
                var writeToIndexStats = new List<PerformanceStats>();

                IndexingPerformanceStats performance = null;

                parent.Write((indexWriter, analyzer, stats) =>
                {
                    stats.Operation = IndexingWorkStats.Status.Reduce;

                    try
                    {
                        if (Level == 2)
                        {
                            RemoveExistingReduceKeysFromIndex(indexWriter, deleteExistingDocumentsDuration);
                        }

                        foreach (var mappedResults in MappedResultsByBucket)
                        {
                            var input = mappedResults.Select(x =>
                            {
                                sourceCount++;
                                return x;
                            });

                            IndexingFunc reduceDefinition = ViewGenerator.ReduceDefinition;
                            foreach (var doc in parent.RobustEnumerationReduce(input.GetEnumerator(), reduceDefinition, stats, linqExecutionDuration))
                            {
                                count++;

                                switch (Level)
                                {
                                    case 0:
                                    case 1:
                                        string reduceKeyAsString = ExtractReduceKey(ViewGenerator, doc);
                                        Actions.MapReduce.PutReducedResult(indexId, reduceKeyAsString, Level + 1, mappedResults.Key, mappedResults.Key / 1024, ToJsonDocument(doc));
                                        Actions.General.MaybePulseTransaction();
                                        break;
                                    case 2:
                                        WriteDocumentToIndex(doc, indexWriter, analyzer, convertToLuceneDocumentDuration, addDocumentDuration);
                                        break;
                                    default:
                                        throw new InvalidOperationException("Unknown level: " + Level);
                                }

                                stats.ReduceSuccesses++;
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        if (Level == 2)
                        {
                            batchers.ApplyAndIgnoreAllErrors(
                                ex =>
                                {
                                    logIndexing.WarnException("Failed to notify index update trigger batcher about an error in " + parent.PublicName, ex);
                                    Context.AddError(indexId, parent.indexDefinition.Name, null, ex, "AnErrorOccured Trigger");
                                },
                                x => x.AnErrorOccured(e));
                        }
                        throw;
                    }
                    finally
                    {
                        if (Level == 2)
                        {
                            batchers.ApplyAndIgnoreAllErrors(
                                e =>
                                {
                                    logIndexing.WarnException("Failed to dispose on index update trigger in " + parent.PublicName, e);
                                    Context.AddError(indexId, parent.indexDefinition.Name, null, e, "Dispose Trigger");
                                },
                                x => x.Dispose());
                        }

                        // TODO: Check if we need to report "Bucket Counts" or "Total Input Elements"?
                        performance = parent.RecordCurrentBatch("Current Reduce #" + Level, "Reduce Level " + Level, sourceCount);
                    }

                    return new IndexedItemsInfo(null)
                    {
                        ChangedDocs = count + ReduceKeys.Count
                    };
                }, writeToIndexStats);

                var performanceStats = new List<BasePerformanceStats>();
                performanceStats.Add(PerformanceStats.From(IndexingOperation.Linq_ReduceLinqExecution, linqExecutionDuration.ElapsedMilliseconds));
                performanceStats.Add(PerformanceStats.From(IndexingOperation.Lucene_DeleteExistingDocument, deleteExistingDocumentsDuration.ElapsedMilliseconds));
                performanceStats.Add(PerformanceStats.From(IndexingOperation.Lucene_ConvertToLuceneDocument, convertToLuceneDocumentDuration.ElapsedMilliseconds));
                performanceStats.Add(PerformanceStats.From(IndexingOperation.Lucene_AddDocument, addDocumentDuration.ElapsedMilliseconds));
                performanceStats.AddRange(writeToIndexStats);

                parent.BatchCompleted("Current Reduce #" + Level, "Reduce Level " + Level, sourceCount, count, performanceStats);

                if (logIndexing.IsDebugEnabled)
                    logIndexing.Debug(() => string.Format("Reduce resulted in {0} entries for {1} for reduce keys at level {3}: {2}", count, parent.PublicName, string.Join(", ", ReduceKeys), Level));

                return performance;
            }
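
            // Builds the final Lucene document for one reduce result: the reduce key
            // (indexed, not stored), the reduce value JSON (stored, not indexed), plus
            // the generated fields; any conversion failure is logged and the entry skipped.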
            private void WriteDocumentToIndex(object doc, RavenIndexWriter indexWriter, Analyzer analyzer, Stopwatch convertToLuceneDocumentDuration, Stopwatch addDocumentDuration)
            {
                string reduceKeyAsString;
                using (StopwatchScope.For(convertToLuceneDocumentDuration))
                {
                    float boost;
                    try
                    {
                        var fields = GetFields(doc, out boost);

                        reduceKeyAsString = ExtractReduceKey(ViewGenerator, doc);
                        reduceKeyField.SetValue(reduceKeyAsString);
                        reduceValueField.SetValue(ToJsonDocument(doc).ToString(Formatting.None));

                        luceneDoc.GetFields().Clear();
                        luceneDoc.Boost = boost;
                        luceneDoc.Add(reduceKeyField);
                        luceneDoc.Add(reduceValueField);
                        foreach (var field in fields)
                            luceneDoc.Add(field);
                    }
                    catch (Exception e)
                    {
                        Context.AddError(indexId,
                            parent.PublicName,
                            TryGetDocKey(doc),
                            e,
                            "Reduce"
                            );
                        logIndexing.WarnException("Could not get fields during reduce for " + parent.PublicName, e);
                        return;
                    }
                }

                batchers.ApplyAndIgnoreAllErrors(
                    exception =>
                    {
                        logIndexing.WarnException(
                            string.Format("Error when executing OnIndexEntryCreated trigger for index '{0}', key: '{1}'",
                                parent.PublicName, reduceKeyAsString),
                            exception);
                        Context.AddError(indexId, parent.PublicName, reduceKeyAsString, exception, "OnIndexEntryCreated Trigger");
                    },
                    trigger => trigger.OnIndexEntryCreated(reduceKeyAsString, luceneDoc));

                parent.LogIndexedDocument(reduceKeyAsString, luceneDoc);

                using (StopwatchScope.For(addDocumentDuration))
                {
                    parent.AddDocumentToIndex(indexWriter, luceneDoc, analyzer);
                }
            }

            private void RemoveExistingReduceKeysFromIndex(RavenIndexWriter indexWriter, Stopwatch deleteExistingDocumentsDuration)
            {
                foreach (var reduceKey in ReduceKeys)
                {
                    var entryKey = reduceKey;
                    parent.InvokeOnIndexEntryDeletedOnAllBatchers(batchers, new Term(Constants.ReduceKeyFieldName, entryKey));

                    using (StopwatchScope.For(deleteExistingDocumentsDuration))
                    {
                        indexWriter.DeleteDocuments(new Term(Constants.ReduceKeyFieldName, entryKey));
                    }
                }
            }
        }
    }
}