PageRenderTime 43ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/Samples/Source/MapReduce/MapReduceService/MapReduceBlobSet.cs

https://github.com/DIanbi/lokad-cloud
C# | 534 lines | 338 code | 110 blank | 86 comment | 19 complexity | 4dc72a91b8bc7d0a053da374be184911 MD5 | raw file
  1. #region Copyright (c) Lokad 2009-2011
  2. // This code is released under the terms of the new BSD licence.
  3. // URL: http://www.lokad.com/
  4. #endregion
  5. using System;
  6. using System.Collections.Generic;
  7. using Lokad.Cloud.Storage;
  8. namespace Lokad.Cloud.Samples.MapReduce
  9. {
  10. /// <summary>Manages sets of blobs for map/reduce services.</summary>
  11. /// <typeparam name="TMapIn">The type of the items that are input in the map operation.</typeparam>
  12. /// <typeparam name="TMapOut">The type of the items that are output from the map operation.</typeparam>
  13. /// <typeparam name="TReduceOut">The type of the items that are output from the reduce operation.</typeparam>
  14. /// <remarks>All public members are thread-safe.</remarks>
  15. /// <seealso cref="MapReduceService"/>
  16. /// <seealso cref="MapReduceJob"/>
  17. public sealed class MapReduceBlobSet
  18. {
  /// <summary>Name of the queue that receives the map/reduce work items (<see cref="T:JobMessage"/> instances).</summary>
  internal const string JobsQueueName = "blobsets";

  /// <summary>Name of the blob container used by every blob name type declared in this class.</summary>
  internal const string ContainerName = "blobsets";

  /// <summary>First name component of job configuration blobs.</summary>
  internal const string ConfigPrefix = "config";

  /// <summary>First name component of input blobs.</summary>
  internal const string InputPrefix = "input";

  /// <summary>First name component of reduced-result blobs.</summary>
  internal const string ReducedPrefix = "reduced";

  /// <summary>First name component of aggregated-result blobs.</summary>
  internal const string AggregatedPrefix = "aggregated";

  /// <summary>First name component of the completed-blobset counter blobs.</summary>
  internal const string CounterPrefix = "counter";

  // Final blob names:
  // - blobsets/config/<job-name> -- map/reduce/aggregate functions plus number of queued blobsets -- read-only
  // - blobsets/input/<job-name>/<blob-guid> -- read-only
  // - blobsets/reduced/<job-name>/<blob-guid> -- write-only
  // - blobsets/aggregated/<job-name> -- write-only
  // - blobsets/counter/<job-name> -- read/write

  /// <summary>Blob storage provider received by the constructor; never reassigned afterwards.</summary>
  private readonly IBlobStorageProvider _blobStorage;

  /// <summary>Queue storage provider received by the constructor; never reassigned afterwards.</summary>
  private readonly IQueueStorageProvider _queueStorage;
  /// <summary>Creates a blob set manager bound to the given storage providers.</summary>
  /// <param name="blobStorage">The blob storage provider; must not be <c>null</c>.</param>
  /// <param name="queueStorage">The queue storage provider; must not be <c>null</c>.</param>
  /// <exception cref="ArgumentNullException">If either provider is <c>null</c>.</exception>
  public MapReduceBlobSet(IBlobStorageProvider blobStorage, IQueueStorageProvider queueStorage)
  {
      if (blobStorage == null)
      {
          throw new ArgumentNullException("blobStorage");
      }

      if (queueStorage == null)
      {
          throw new ArgumentNullException("queueStorage");
      }

      this._blobStorage = blobStorage;
      this._queueStorage = queueStorage;
  }
  /// <summary>Looks up the stored configuration blob of a job.</summary>
  /// <param name="jobName">The name of the job.</param>
  /// <returns>The lookup result; callers check <c>HasValue</c> before reading <c>Value</c>.</returns>
  private Maybe<MapReduceConfiguration> GetJobConfig(string jobName)
  {
      return _blobStorage.GetBlob(MapReduceConfigurationName.Create(jobName));
  }
  /// <summary>Stores the job configuration, uploads every item as an input blob and queues one work message per blob set.</summary>
  /// <param name="jobName">The name of the job (should be unique).</param>
  /// <param name="items">The items that must be processed; at least one is required.</param>
  /// <param name="functions">The map/reduce functions; only the assembly-qualified name of their type is stored.</param>
  /// <param name="workerCount">The number of workers to use; it caps the number of blob sets and must be positive.</param>
  /// <param name="mapIn">The type of the map input.</param>
  /// <param name="mapOut">The type of the map output.</param>
  /// <exception cref="ArgumentNullException">If <paramref name="items"/>, <paramref name="functions"/>, <paramref name="mapIn"/> or <paramref name="mapOut"/> is <c>null</c>.</exception>
  /// <exception cref="ArgumentException">If <paramref name="items"/> is empty.</exception>
  /// <exception cref="ArgumentOutOfRangeException">If <paramref name="workerCount"/> is zero or negative.</exception>
  /// <remarks>This method should be called from <see cref="T:MapReduceJob"/>.</remarks>
  public void GenerateBlobSets(string jobName, IList<object> items, IMapReduceFunctions functions, int workerCount, Type mapIn, Type mapOut)
  {
      // Note: items is IList and not IEnumerable because the number of items must be known up-front

      // Validate everything before touching storage, so that a bad call leaves no partial job behind.
      if (items == null)
      {
          throw new ArgumentNullException("items");
      }

      if (functions == null)
      {
          throw new ArgumentNullException("functions");
      }

      if (mapIn == null)
      {
          throw new ArgumentNullException("mapIn");
      }

      if (mapOut == null)
      {
          throw new ArgumentNullException("mapOut");
      }

      if (workerCount <= 0)
      {
          throw new ArgumentOutOfRangeException("workerCount", "At least one worker is required");
      }

      int itemCount = items.Count;
      if (itemCount == 0)
      {
          throw new ArgumentException("At least one item is required", "items");
      }

      // Never create more blob sets than there are items, so every set holds at least one item.
      int blobSetCount = Math.Min(workerCount, itemCount);

      // Even split: the first 'largerSets' sets get one extra item.
      // Sizing all sets but the last with Ceiling(itemCount / blobSetCount) can overshoot
      // (e.g. 5 items in 4 sets -> 2, 2, 2, -1), hence the integer quotient/remainder split.
      int baseSetSize = itemCount / blobSetCount;
      int largerSets = itemCount % blobSetCount;

      string ignored;

      // 1. Store configuration
      var configBlobName = MapReduceConfigurationName.Create(jobName);
      var config = new MapReduceConfiguration()
      {
          TMapInType = mapIn.AssemblyQualifiedName,
          TMapOutType = mapOut.AssemblyQualifiedName,
          MapReduceFunctionsImplementor = functions.GetType().AssemblyQualifiedName,
          BlobSetCount = blobSetCount
      };

      _blobStorage.PutBlob(configBlobName.ContainerName, configBlobName.ToString(),
          config, typeof(MapReduceConfiguration), false, out ignored);

      // 2. Store the input blobs of each set, then queue the message that triggers its processing
      int nextItem = 0;
      for (int currSet = 0; currSet < blobSetCount; currSet++)
      {
          int thisSetSize = currSet < largerSets ? baseSetSize + 1 : baseSetSize;

          for (int i = 0; i < thisSetSize; i++)
          {
              // BlobSet and Blob IDs start from zero
              var inputName = InputBlobName.Create(jobName, currSet, i);
              _blobStorage.PutBlob(inputName.ContainerName, inputName.ToString(), items[nextItem], mapIn, false, out ignored);
              nextItem++;
          }

          _queueStorage.Put(JobsQueueName, new JobMessage() { Type = MessageType.BlobSetToProcess, JobName = jobName, BlobSetId = currSet });
      }
  }
  /// <summary>Instantiates the map/reduce functions implementation named in a job configuration.</summary>
  /// <param name="typeName">The type name stored in the configuration (written as an assembly-qualified name).</param>
  /// <returns>A new instance of the named type, never <c>null</c>.</returns>
  /// <exception cref="InvalidOperationException">If the type cannot be resolved or does not implement <see cref="IMapReduceFunctions"/>.</exception>
  private static IMapReduceFunctions GetMapReduceFunctions(string typeName)
  {
      // Type.GetType returns null for a type it cannot resolve; fail here with a clear message
      // rather than letting Activator reject a null type.
      var implementorType = Type.GetType(typeName);
      if (implementorType == null)
      {
          throw new InvalidOperationException("Cannot resolve the map/reduce functions type '" + typeName + "'");
      }

      // 'as' yields null on a type mismatch; surface that now instead of as a NullReferenceException at the first use.
      var functions = Activator.CreateInstance(implementorType) as IMapReduceFunctions;
      if (functions == null)
      {
          throw new InvalidOperationException("Type '" + typeName + "' does not implement IMapReduceFunctions");
      }

      return functions;
  }
  /// <summary>Maps every input blob of a blob set, reduces the mapped values to one and stores that value as the set's reduced blob.</summary>
  /// <param name="jobName">The name of the job.</param>
  /// <param name="blobSetId">The blobset ID.</param>
  /// <remarks>
  /// This method should be called from <see cref="T:MapReduceService"/>.
  /// It returns without doing anything when the job configuration is missing or the blob set has no
  /// readable input blobs, so that a message left over from a deleted or already-processed blob set
  /// does not fail.
  /// </remarks>
  public void PerformMapReduce(string jobName, int blobSetId)
  {
      // 1. Load config
      // 2. For all blobs in blobset, do map (output N)
      // 3. For all mapped items, do reduce (output 1)
      // 4. Store reduce result
      // 5. Update counter
      // 6. If blobsets are all processed --> enqueue aggregation message
      // 7. Delete blobset

      // 1. Load config
      var configLookup = GetJobConfig(jobName);
      if (!configLookup.HasValue)
      {
          // No configuration: the job data is gone, so there is nothing left to process.
          return;
      }

      var config = configLookup.Value;
      var blobsetPrefix = InputBlobName.GetPrefix(jobName, blobSetId);
      var mapReduceFunctions = GetMapReduceFunctions(config.MapReduceFunctionsImplementor);
      var mapIn = Type.GetType(config.TMapInType);
      var mapOut = Type.GetType(config.TMapOutType);

      // 2. Do map for all blobs in the blobset
      // A queue preserves the original combination order (take the two oldest values, append
      // their reduction) without the O(n) cost of removing from the front of a list.
      var pending = new Queue<object>();
      string ignored;
      foreach (var blobName in _blobStorage.ListBlobNames(blobsetPrefix))
      {
          var inputBlob = _blobStorage.GetBlob(blobName.ContainerName, blobName.ToString(), mapIn, out ignored);
          if (!inputBlob.HasValue)
          {
              continue;
          }

          pending.Enqueue(InvokeAsDelegate(mapReduceFunctions.GetMapper(), inputBlob.Value));
      }

      if (pending.Count == 0)
      {
          // Nothing was mapped: storing a result or bumping the counter would be wrong.
          return;
      }

      // 3. Do reduce for all mapped results
      while (pending.Count > 1)
      {
          object item1 = pending.Dequeue();
          object item2 = pending.Dequeue();
          pending.Enqueue(InvokeAsDelegate(mapReduceFunctions.GetReducer(), item1, item2));
      }

      // 4. Store reduced result
      var reducedBlobName = ReducedBlobName.Create(jobName, blobSetId);
      _blobStorage.PutBlob(reducedBlobName.ContainerName, reducedBlobName.ToString(), pending.Dequeue(), mapOut, false, out ignored);

      // 5. Update counter
      var counterName = BlobCounterName.Create(jobName);
      var counter = new BlobCounter(_blobStorage, counterName);
      var totalCompletedBlobSets = (int) counter.Increment(1);

      // 6. Queue aggregation if appropriate
      if (totalCompletedBlobSets == config.BlobSetCount)
      {
          _queueStorage.Put(JobsQueueName,
              new JobMessage {JobName = jobName, BlobSetId = null, Type = MessageType.ReducedDataToAggregate});
      }

      // 7. Delete blobset's blobs
      _blobStorage.DeleteAllBlobs(blobsetPrefix);
  }
  /// <summary>Combines the reduced results of all blob sets of a job into the job's single aggregated result.</summary>
  /// <param name="jobName">The name of the job.</param>
  /// <remarks>
  /// The reduced values are combined pairwise with the reducer returned by the job's map/reduce functions.
  /// The method returns without doing anything when the job configuration is missing or no reduced
  /// blob can be read, so that a message left over from a deleted or already-aggregated job does not fail.
  /// </remarks>
  public void PerformAggregate(string jobName)
  {
      // 1. Load config
      // 2. Do aggregation
      // 3. Store result
      // 4. Delete reduced data

      // 1. Load config
      var configLookup = GetJobConfig(jobName);
      if (!configLookup.HasValue)
      {
          // No configuration: the job data is gone, so there is nothing left to aggregate.
          return;
      }

      var config = configLookup.Value;
      var reducedBlobPrefix = ReducedBlobName.GetPrefix(jobName);
      Type mapOut = Type.GetType(config.TMapOutType);

      // 2. Load reduced items and do aggregation
      // A queue preserves the original combination order (take the two oldest values, append
      // their reduction) without the O(n) cost of removing from the front of a list.
      var pending = new Queue<object>();
      string ignored;
      foreach (var blobName in _blobStorage.ListBlobNames(reducedBlobPrefix))
      {
          var blob = _blobStorage.GetBlob(blobName.ContainerName, blobName.ToString(), mapOut, out ignored);
          if (!blob.HasValue)
          {
              continue;
          }

          pending.Enqueue(blob.Value);
      }

      if (pending.Count == 0)
      {
          // No reduced data available: there is no value to store.
          return;
      }

      IMapReduceFunctions mapReduceFunctions = GetMapReduceFunctions(config.MapReduceFunctionsImplementor);
      while (pending.Count > 1)
      {
          object item1 = pending.Dequeue();
          object item2 = pending.Dequeue();
          pending.Enqueue(InvokeAsDelegate(mapReduceFunctions.GetReducer(), item1, item2));
      }

      // 3. Store aggregated result
      var aggregatedBlobName = AggregatedBlobName.Create(jobName);
      _blobStorage.PutBlob(aggregatedBlobName.ContainerName, aggregatedBlobName.ToString(), pending.Dequeue(), mapOut, false, out ignored);

      // 4. Delete reduced data
      _blobStorage.DeleteAllBlobs(reducedBlobPrefix);
  }
  /// <summary>Reports the progress of a job as a pair of blobset counts.</summary>
  /// <param name="jobName">The name of the job.</param>
  /// <returns><c>Item1</c> is the current value of the job's completed-blobset counter; <c>Item2</c> is the blobset count stored in the job configuration.</returns>
  /// <exception cref="ArgumentException">If no configuration exists for <paramref name="jobName"/>.</exception>
  public System.Tuple<int, int> GetCompletedBlobSets(string jobName)
  {
      var jobConfig = GetJobConfig(jobName);
      if (jobConfig.HasValue)
      {
          var progress = new BlobCounter(_blobStorage, BlobCounterName.Create(jobName));
          int done = (int)progress.GetValue();
          int total = jobConfig.Value.BlobSetCount;
          return new System.Tuple<int, int>(done, total);
      }

      throw new ArgumentException("Unknown job", "jobName");
  }
  /// <summary>Fetches the aggregated result blob of a finished map/reduce job.</summary>
  /// <typeparam name="T">The type the stored result is cast to.</typeparam>
  /// <param name="jobName">The name of the job.</param>
  /// <returns>The aggregated result, cast to <typeparamref name="T"/>.</returns>
  /// <exception cref="InvalidOperationException">If the job is not yet complete: either blobsets remain to be processed or the aggregated blob is not available yet.</exception>
  /// <exception cref="ArgumentException">If no configuration exists for <paramref name="jobName"/>.</exception>
  public T GetAggregatedResult<T>(string jobName)
  {
      var jobConfig = GetJobConfig(jobName);
      if (!jobConfig.HasValue)
      {
          throw new ArgumentException("Unknown job", "jobName");
      }

      // First gate: every blobset must have been counted as completed.
      var progress = new BlobCounter(_blobStorage, BlobCounterName.Create(jobName));
      bool allSetsProcessed = (int)progress.GetValue() >= jobConfig.Value.BlobSetCount;
      if (!allSetsProcessed)
      {
          throw new InvalidOperationException("Job is not complete (there still are blobsets to process)");
      }

      // Second gate: the aggregation step must have stored its blob.
      Type resultType = Type.GetType(jobConfig.Value.TMapOutType);
      var resultName = AggregatedBlobName.Create(jobName);
      string unused;
      var stored = _blobStorage.GetBlob(resultName.ContainerName, resultName.ToString(), resultType, out unused);
      if (stored.HasValue)
      {
          return (T) stored.Value;
      }

      throw new InvalidOperationException("Job is not complete (reduced items must still be aggregated)");
  }
  /// <summary>Removes every blob belonging to a job, whatever state the job is in.</summary>
  /// <param name="jobName">The name of the job.</param>
  /// <remarks>Messages enqueued cannot be deleted but they cause no harm.</remarks>
  public void DeleteJobData(string jobName)
  {
      // Build all names first; the deletions below run in the same order as before.
      var configName = MapReduceConfigurationName.Create(jobName);
      var inputPrefix = InputBlobName.GetPrefix(jobName);
      var reducedPrefix = ReducedBlobName.GetPrefix(jobName);
      var aggregatedName = AggregatedBlobName.Create(jobName);
      var counterName = BlobCounterName.Create(jobName);

      _blobStorage.DeleteBlobIfExist(configName);
      _blobStorage.DeleteAllBlobs(inputPrefix);
      _blobStorage.DeleteAllBlobs(reducedPrefix);
      _blobStorage.DeleteBlobIfExist(aggregatedName);
      _blobStorage.DeleteBlobIfExist(counterName);
  }
  /// <summary>Lists the jobs for which a configuration blob exists.</summary>
  /// <returns>The job names read from the listed configuration blob names.</returns>
  public IList<string> GetExistingJobs()
  {
      var configPrefix = MapReduceConfigurationName.GetPrefix();
      var jobNames = new List<string>();

      foreach (var configName in _blobStorage.ListBlobNames(configPrefix))
      {
          jobNames.Add(configName.JobName);
      }

      return jobNames;
  }
  275. #region Delegate Utils
  /// <summary>Calls the <c>Invoke</c> member of <paramref name="target"/> through reflection.</summary>
  /// <param name="target">The object to invoke; the callers in this class pass the delegates returned by the map/reduce functions.</param>
  /// <param name="inputs">The arguments passed to <c>Invoke</c>.</param>
  /// <returns>The value returned by the invoked member.</returns>
  static object InvokeAsDelegate(object target, params object[] inputs)
  {
      const System.Reflection.BindingFlags invokeFlags = System.Reflection.BindingFlags.InvokeMethod;
      Type targetType = target.GetType();
      return targetType.InvokeMember("Invoke", invokeFlags, null, target, inputs);
  }
  282. #endregion
  283. #region Private Classes
  /// <summary>Settings of a map/reduce job, stored as the job's configuration blob.</summary>
  /// <remarks>The declarations are kept exactly as they were: the type is <c>[Serializable]</c>, so its member layout may be part of the stored format.</remarks>
  [Serializable]
  public class MapReduceConfiguration
  {
      /// <summary>Type name of the map input type (written as an assembly-qualified name).</summary>
      public string TMapInType { get; set; }

      /// <summary>Type name of the map output type (written as an assembly-qualified name).</summary>
      public string TMapOutType { get; set; }

      /// <summary>Type name of the class that implements <see cref="IMapReduceFunctions"/>.</summary>
      public string MapReduceFunctionsImplementor { get; set; }

      /// <summary>How many blobsets the job was split into.</summary>
      public int BlobSetCount { get; set; }
  }
  /// <summary>Blob name of a job configuration blob: a prefix component followed by the job name.</summary>
  public class MapReduceConfigurationName : BlobName<MapReduceConfiguration>
  {
      /// <summary>First name component; the factory methods set it to <c>ConfigPrefix</c>.</summary>
      [Rank(0)]
      public string Prefix;

      /// <summary>Second name component; <c>null</c> in the instance returned by <see cref="GetPrefix"/>.</summary>
      [Rank(1)]
      public string JobName;

      /// <summary>Builds a name from its two components.</summary>
      /// <param name="prefix">The prefix component.</param>
      /// <param name="jobName">The job name component.</param>
      public MapReduceConfigurationName(string prefix, string jobName)
      {
          this.Prefix = prefix;
          this.JobName = jobName;
      }

      /// <summary>The container shared by all map/reduce blobs.</summary>
      public override string ContainerName
      {
          // Must stay qualified: the unqualified name would bind to this property itself.
          get { return MapReduceBlobSet.ContainerName; }
      }

      /// <summary>Returns the configuration blob name of the given job.</summary>
      public static MapReduceConfigurationName Create(string jobName)
      {
          return new MapReduceConfigurationName(ConfigPrefix, jobName);
      }

      /// <summary>Returns a name with no job component, used to list all configuration blobs.</summary>
      public static MapReduceConfigurationName GetPrefix()
      {
          return new MapReduceConfigurationName(ConfigPrefix, null);
      }
  }
  /// <summary>Blob name of one input item: prefix, job name, blobset ID and blob ID, in that rank order.</summary>
  /// <remarks>Instances whose trailing components are <c>null</c> are passed to the storage provider as listing/deletion prefixes.</remarks>
  private class InputBlobName : BlobName<object>
  {
      /// <summary>First name component; the factory methods set it to <c>InputPrefix</c>.</summary>
      [Rank(0)]
      public string Prefix;

      /// <summary>Second name component: the job name.</summary>
      [Rank(1)]
      public string JobName;

      /// <summary>Third name component; <c>null</c> in a job-wide prefix.</summary>
      [Rank(2)]
      public int? BlobSetId;

      /// <summary>Fourth name component; <c>null</c> in any prefix.</summary>
      [Rank(3)]
      public int? BlobId;

      /// <summary>Builds a name (or prefix) from its components.</summary>
      public InputBlobName(string prefix, string jobName, int? blobSetId, int? blobId)
      {
          this.Prefix = prefix;
          this.JobName = jobName;
          this.BlobSetId = blobSetId;
          this.BlobId = blobId;
      }

      /// <summary>The container shared by all map/reduce blobs.</summary>
      public override string ContainerName
      {
          // Must stay qualified: the unqualified name would bind to this property itself.
          get { return MapReduceBlobSet.ContainerName; }
      }

      /// <summary>Returns the full name of one input blob.</summary>
      public static InputBlobName Create(string jobName, int blobSetId, int blobId)
      {
          return new InputBlobName(InputPrefix, jobName, blobSetId, blobId);
      }

      /// <summary>Returns the prefix covering the input blobs of one blobset.</summary>
      public static InputBlobName GetPrefix(string jobName, int blobSetId)
      {
          return new InputBlobName(InputPrefix, jobName, blobSetId, null);
      }

      /// <summary>Returns the prefix covering the input blobs of a whole job.</summary>
      public static InputBlobName GetPrefix(string jobName)
      {
          return new InputBlobName(InputPrefix, jobName, null, null);
      }
  }
  /// <summary>Blob name of the reduced result of one blobset: prefix, job name and blobset ID, in that rank order.</summary>
  private class ReducedBlobName : BlobName<object>
  {
      /// <summary>First name component; the factory methods set it to <c>ReducedPrefix</c>.</summary>
      [Rank(0)]
      public string Prefix;

      /// <summary>Second name component: the job name.</summary>
      [Rank(1)]
      public string JobName;

      /// <summary>Third name component: the blobset ID (0 in the instance returned by <see cref="GetPrefix"/>).</summary>
      // NOTE(review): the second Rank argument presumably makes a default value (0) be left out of
      // the name, which is what lets GetPrefix produce a job-wide prefix. Blobset IDs start from
      // zero, so confirm that the reduced blob of blobset 0 still gets a usable, distinct name.
      [Rank(2, true)]
      public int BlobSetId;

      /// <summary>Builds a name (or prefix) from its components.</summary>
      public ReducedBlobName(string prefix, string jobName, int blobSetIt)
      {
          this.Prefix = prefix;
          this.JobName = jobName;
          this.BlobSetId = blobSetIt;
      }

      /// <summary>The container shared by all map/reduce blobs.</summary>
      public override string ContainerName
      {
          // Must stay qualified: the unqualified name would bind to this property itself.
          get { return MapReduceBlobSet.ContainerName; }
      }

      /// <summary>Returns the name of the reduced blob of one blobset.</summary>
      public static ReducedBlobName Create(string jobName, int blobSetId)
      {
          return new ReducedBlobName(ReducedPrefix, jobName, blobSetId);
      }

      /// <summary>Returns the name used as prefix for all reduced blobs of a job.</summary>
      public static ReducedBlobName GetPrefix(string jobName)
      {
          return new ReducedBlobName(ReducedPrefix, jobName, 0);
      }
  }
  /// <summary>Blob name of a job's aggregated result: a prefix component followed by the job name.</summary>
  private class AggregatedBlobName : BlobName<object>
  {
      /// <summary>First name component; <see cref="Create"/> sets it to <c>AggregatedPrefix</c>.</summary>
      [Rank(0)]
      public string Prefix;

      /// <summary>Second name component: the job name.</summary>
      [Rank(1)]
      public string JobName;

      /// <summary>Builds a name from its two components.</summary>
      public AggregatedBlobName(string prefix, string jobName)
      {
          this.Prefix = prefix;
          this.JobName = jobName;
      }

      /// <summary>The container shared by all map/reduce blobs.</summary>
      public override string ContainerName
      {
          // Must stay qualified: the unqualified name would bind to this property itself.
          get { return MapReduceBlobSet.ContainerName; }
      }

      /// <summary>Returns the aggregated-result blob name of the given job.</summary>
      public static AggregatedBlobName Create(string jobName)
      {
          return new AggregatedBlobName(AggregatedPrefix, jobName);
      }
  }
  /// <summary>Blob name of a job's completed-blobset counter: a prefix component followed by the job name.</summary>
  private class BlobCounterName : BlobName<decimal>
  {
      /// <summary>First name component; <see cref="Create"/> sets it to <c>CounterPrefix</c>.</summary>
      [Rank(0)]
      public string Prefix;

      /// <summary>Second name component: the job name.</summary>
      [Rank(1)]
      public string JobName;

      /// <summary>Builds a name from its two components.</summary>
      public BlobCounterName(string prefix, string jobName)
      {
          this.Prefix = prefix;
          this.JobName = jobName;
      }

      /// <summary>The container shared by all map/reduce blobs.</summary>
      public override string ContainerName
      {
          // Must stay qualified: the unqualified name would bind to this property itself.
          get { return MapReduceBlobSet.ContainerName; }
      }

      /// <summary>Returns the counter blob name of the given job.</summary>
      public static BlobCounterName Create(string jobName)
      {
          return new BlobCounterName(CounterPrefix, jobName);
      }
  }
  422. #endregion
  423. }
  424. }