PageRenderTime 38ms CodeModel.GetById 12ms app.highlight 20ms RepoModel.GetById 1ms app.codeStats 0ms

/Raven.Database/Counters/Controllers/RavenCounterReplication.cs

https://github.com/nwendel/ravendb
C# | 572 lines | 490 code | 72 blank | 10 comment | 55 complexity | 8718234692797c4db0eddf7a72642f8c MD5 | raw file
  1using System;
  2using System.Collections.Concurrent;
  3using System.Collections.Generic;
  4using System.Diagnostics;
  5using System.IO;
  6using System.Linq;
  7using System.Net;
  8using System.Threading;
  9using System.Threading.Tasks;
 10using Raven.Abstractions;
 11using Raven.Abstractions.Connection;
 12using Raven.Abstractions.Counters;
 13using Raven.Abstractions.Data;
 14using Raven.Abstractions.Logging;
 15using Raven.Json.Linq;
 16
 17namespace Raven.Database.Counters.Controllers
 18{
 19	public class RavenCounterReplication:IDisposable
 20	{
 21        private static readonly ILog Log = LogManager.GetCurrentClassLogger();
 22
 23        private readonly object waitForCounterUpdate = new object();
 24        private int actualWorkCounter = 0; // represents the number of changes in 
 25        private int replicatedWorkCounter = 0; // represents the last actualWorkCounter value that was checked in the last replication iteration
 26        private bool shouldPause = false;
 27        private bool IsRunning { get; set; }
 28        private readonly ConcurrentDictionary<string, CounterDestinationStats> destinationsStats =
 29            new ConcurrentDictionary<string, CounterDestinationStats>(StringComparer.OrdinalIgnoreCase);
 30        private int replicationAttempts;
 31        private readonly ConcurrentDictionary<string, SemaphoreSlim> activeReplicationTasks = new ConcurrentDictionary<string, SemaphoreSlim>();
 32        private readonly ConcurrentQueue<Task> activeTasks = new ConcurrentQueue<Task>();
 33        private HttpRavenRequestFactory httpRavenRequestFactory;
 34
 35		private readonly CounterStorage storage;
 36		private readonly CancellationTokenSource cancellation;
 37
 38        
 39
 40		enum ReplicationResult
 41		{
 42			Success = 0,
 43			Failure = 1,
 44			NotReplicated = 2
 45		}
 46
 47		public RavenCounterReplication(CounterStorage storage)
 48		{
 49			this.storage = storage;
 50			this.storage.CounterUpdated += SignalCounterUpdate;
 51			cancellation = new CancellationTokenSource();
 52		}
 53
 54		public void SignalCounterUpdate()
 55		{
 56			lock (waitForCounterUpdate)
 57			{
 58				Interlocked.Increment(ref actualWorkCounter);
 59				Monitor.PulseAll(waitForCounterUpdate);
 60			}
 61		}
 62
 63	    public void StartReplication()
 64	    {
 65            var replicationTask = new Task(ReplicationAction, TaskCreationOptions.LongRunning);
 66
 67            httpRavenRequestFactory = new HttpRavenRequestFactory { RequestTimeoutInMs = storage.ReplicationTimeoutInMs };
 68            replicationTask.Start();
 69	    }
 70
 71	    private void ReplicationAction()
 72	    {
 73	        var runningBecauseOfDataModification = false;
 74            var timeToWaitInMinutes = TimeSpan.FromMinutes(5);
 75
 76            //NotifySiblings(); //TODO: implement
 77
 78            while (!cancellation.IsCancellationRequested)
 79            {
 80                SendReplicationToAllServers(runningBecauseOfDataModification);
 81                runningBecauseOfDataModification = WaitForCountersUpdate(timeToWaitInMinutes);
 82                timeToWaitInMinutes = runningBecauseOfDataModification ? TimeSpan.FromSeconds(30) : TimeSpan.FromMinutes(5);
 83            }
 84
 85	        IsRunning = false;
 86	    }
 87
 88		private bool WaitForCountersUpdate(TimeSpan timeout)
 89		{
 90			if (Thread.VolatileRead(ref actualWorkCounter) != replicatedWorkCounter)
 91			{
 92				replicatedWorkCounter = actualWorkCounter;
 93				return true;
 94			}
 95			lock (waitForCounterUpdate)
 96			{
 97				if (Thread.VolatileRead(ref actualWorkCounter) != replicatedWorkCounter)
 98				{
 99					replicatedWorkCounter = actualWorkCounter;
100					return true;
101				}
102
103				Log.Debug("No counter updates for counter storage {0} was found, will wait for updates", storage.CounterStorageUrl);
104				return Monitor.Wait(waitForCounterUpdate, timeout);
105			}
106		}
107
108		public void Pause()
109		{
110			shouldPause = true;
111		}
112
113		public void Continue()
114		{
115			shouldPause = false;
116		}
117
118		public void HandleHeartbeat(string src)
119		{
120			ResetFailureForHeartbeat(src);
121		}
122
123		private void ResetFailureForHeartbeat(string src)
124		{
125			RecordSuccess(src, lastHeartbeatReceived: SystemTime.UtcNow);
126			SignalCounterUpdate();
127		}
128
129		private void SendReplicationToAllServers(bool runningBecauseOfDataModifications)
130		{
131			IsRunning = !shouldPause;
132			if (IsRunning)
133			{
134				try
135				{
136					List<CounterStorageReplicationDestination> replicationDestinations = GetReplicationDestinations();
137
138					if (replicationDestinations != null && replicationDestinations.Count > 0)
139					{
140						var currentReplicationAttempts = Interlocked.Increment(ref replicationAttempts);
141
142						var destinationForReplication = replicationDestinations.Where(
143							destination => (!runningBecauseOfDataModifications || IsNotFailing(destination.CounterStorageUrl, currentReplicationAttempts)) && !destination.Disabled);
144
145						foreach (CounterStorageReplicationDestination destination in destinationForReplication)
146						{
147							ReplicateToDestination(destination);
148						}
149					}
150				}
151				catch (Exception e)
152				{
153					Log.ErrorException("Failed to perform replication", e);
154				}
155			}
156		}
157
158		private void ReplicateToDestination(CounterStorageReplicationDestination destination)
159		{
160			var dest = destination.CounterStorageUrl;
161			var holder = activeReplicationTasks.GetOrAdd(dest, s => new SemaphoreSlim(1));
162			if (holder.Wait(0) == false)
163				return;
164			var replicationTask = Task.Factory.StartNew(
165				() =>
166				{
167					//using (LogContext.WithDatabase(storage.Name)) //TODO: log with counter storage contexe
168					//{
169					try
170					{
171						if (ReplicateTo(destination)) SignalCounterUpdate();
172					}
173					catch (Exception e)
174					{
175						Log.ErrorException("Could not replicate to " + dest, e);
176					}
177					//}
178				});
179
180			activeTasks.Enqueue(replicationTask);
181			replicationTask.ContinueWith(
182				_ =>
183				{
184					// here we purge all the completed tasks at the head of the queue
185					Task task;
186					while (activeTasks.TryPeek(out task))
187					{
188						if (!task.IsCompleted && !task.IsCanceled && !task.IsFaulted) break;
189						activeTasks.TryDequeue(out task); // remove it from end
190					}
191				});
192		}
193
194		private bool ReplicateTo(CounterStorageReplicationDestination destination)
195		{
196            var replicationStopwatch = Stopwatch.StartNew();
197			//todo: here, build url according to :destination.Url + '/counters/' + destination.
198			try
199			{
200				string lastError;
201			    long lastEtag;
202				bool result = false;
203
204				switch (TryReplicate(destination, out lastEtag, out lastError))
205				{
206					case ReplicationResult.Success:
207                        DateTime replicationTime = SystemTime.UtcNow;
208                        RecordSuccess(destination.CounterStorageUrl, lastReplicatedEtag: lastEtag, lastReplicatedLastModified: replicationTime);
209                        storage.MetricsCounters.OutgoingReplications.Mark();
210						result = true;
211						break;
212					case ReplicationResult.NotReplicated:
213						//TODO: Record not replicated
214                        RecordSuccess(destination.CounterStorageUrl, SystemTime.UtcNow);
215						break;
216					default:
217						RecordFailure(destination.CounterStorageUrl, lastError);
218                        storage.MetricsCounters.OutgoingReplications.Mark();
219						break;
220				}
221
222				return result;
223			}
224			catch (Exception ex)
225			{
226				Log.ErrorException("Error occured replicating to: " + destination.CounterStorageUrl, ex);
227				RecordFailure(destination.CounterStorageUrl, ex.Message);
228				return false;
229			}
230			finally
231			{
232                replicationStopwatch.Stop();
233                storage.MetricsCounters.GetReplicationDurationHistogram(destination.CounterStorageUrl).Update((long)replicationStopwatch.Elapsed.TotalMilliseconds);
234                storage.MetricsCounters.GetReplicationDurationMetric(destination.CounterStorageUrl).Mark((long)replicationStopwatch.Elapsed.TotalMilliseconds);
235				var holder = activeReplicationTasks.GetOrAdd(destination.CounterStorageUrl, s => new SemaphoreSlim(0, 1));
236				holder.Release();
237			}
238		}
239
240		private ReplicationResult TryReplicate(CounterStorageReplicationDestination destination, out long lastEtagSent, out string lastError)
241		{
242            long etag = 0;
243		    lastEtagSent = 0;
244			var connectionStringOptions = GetConnectionOptionsSafe(destination, out lastError);
245            
246            if (connectionStringOptions != null && GetLastReplicatedEtagFrom(connectionStringOptions, destination.CounterStorageUrl, out etag, out lastError))
247			{
248                var replicationData = GetCountersDataSinceEtag(etag, out lastEtagSent);
249                
250                storage.MetricsCounters.GetReplicationBatchSizeMetric(destination.CounterStorageUrl).Mark(replicationData.Counters.Count);
251                storage.MetricsCounters.GetReplicationBatchSizeHistogram(destination.CounterStorageUrl).Update(replicationData.Counters.Count);
252
253				if (replicationData.Counters.Count > 0)
254				{
255                    return PerformReplicationToServer(connectionStringOptions, destination.CounterStorageUrl, etag, replicationData, out lastError) ?
256						ReplicationResult.Success : ReplicationResult.Failure;
257				}
258
259				return ReplicationResult.NotReplicated;
260			}
261
262			return ReplicationResult.Failure;
263		}
264
265		private bool GetLastReplicatedEtagFrom(RavenConnectionStringOptions connectionStringOptions, string counterStorageUrl, out long lastEtag, out string lastError)
266		{
267			if (!TryGetLastReplicatedEtagFrom(connectionStringOptions, counterStorageUrl, out lastEtag, out lastError))
268			{
269				if (IsFirstFailure(connectionStringOptions.Url))
270				{
271					return TryGetLastReplicatedEtagFrom(connectionStringOptions, counterStorageUrl, out lastEtag, out lastError);
272				}
273				return false;
274			}
275
276			return true;
277		}
278
279		private bool TryGetLastReplicatedEtagFrom(RavenConnectionStringOptions connectionStringOptions, string counterStorageUrl, out long lastEtag, out string lastError)
280		{
281			lastEtag = 0;
282			try
283			{
284				long etag = 0;
285				var url = string.Format("{0}/lastEtag?serverUrl={1}", counterStorageUrl, storage.CounterStorageUrl);
286				var request = httpRavenRequestFactory.Create(url, "GET", connectionStringOptions);
287				request.ExecuteRequest(etagString => etag = long.Parse(etagString.ReadToEnd()));
288
289				lastEtag = etag;
290				lastError = string.Empty;
291				return true;
292			}
293			catch (WebException e)
294			{
295				lastError = HandleReplicationDistributionWebException(e, counterStorageUrl);
296				return false;
297			}
298			catch (Exception e)
299			{
300				lastError = e.Message;
301				return false;
302			}
303		}
304
305		private bool PerformReplicationToServer(RavenConnectionStringOptions connectionStringOptions, string counterStorageUrl, long etag, ReplicationMessage message, out string lastError)
306		{
307			var destinationUrl = connectionStringOptions.Url;
308
309			if (!TryPerformReplicationToServer(connectionStringOptions, counterStorageUrl, message, out lastError))
310			{
311				if (IsFirstFailure(destinationUrl))
312				{
313					return TryPerformReplicationToServer(connectionStringOptions, counterStorageUrl, message, out lastError);
314				}
315				return false;
316			}
317
318			return true;
319		}
320
321		private bool TryPerformReplicationToServer(RavenConnectionStringOptions connectionStringOptions, string counterStorageUrl, ReplicationMessage message, out string lastError)
322		{
323			try
324			{
325				var url = string.Format("{0}/replication", counterStorageUrl);
326				lastError = string.Empty;
327				var request = httpRavenRequestFactory.Create(url, "POST", connectionStringOptions);
328				request.Write(RavenJObject.FromObject(message));
329				request.ExecuteRequest();
330                
331                return true;
332
333			}
334			catch (WebException e)
335			{
336				lastError = HandleReplicationDistributionWebException(e, counterStorageUrl);
337				return false;
338			}
339			catch (Exception e)
340			{
341				Log.ErrorException("Error occured replicating to: " + counterStorageUrl, e);
342				lastError = e.Message;
343				return false;
344			}
345		}
346
347		private List<CounterStorageReplicationDestination> GetReplicationDestinations()
348		{
349			CounterStorageReplicationDocument replicationData;
350			using (var reader = storage.CreateReader())
351				replicationData = reader.GetReplicationData();
352			return (replicationData != null) ? replicationData.Destinations : null;
353		}
354		
355		private bool IsNotFailing(string destServerName, int currentReplicationAttempts)
356        {
357            CounterDestinationStats destinationStats;
358            if (destinationsStats.TryGetValue(destServerName, out destinationStats) && destinationStats.FailureCount > 10)
359			{
360				bool shouldReplicateTo = false;
361				var failureCount = destinationStats.FailureCount;
362
363			    if (failureCount > 1000)
364			    {
365			        shouldReplicateTo = currentReplicationAttempts%10 == 0;
366			    }
367			    if (failureCount > 100)
368			    {
369			        shouldReplicateTo = currentReplicationAttempts%5 == 0;
370			    }
371			    if (failureCount > 10)
372			    {
373			        shouldReplicateTo = currentReplicationAttempts%2 == 0;
374			    }
375			    Log.Debug("Failure count for {0} is {1}, skipping replication: {2}",
376			        destServerName, failureCount, shouldReplicateTo == false);
377			    return shouldReplicateTo;
378	        }
379			return true;
380        }
381
382	    private ReplicationMessage GetCountersDataSinceEtag(long etag, out long lastEtagSent)
383	    {
384            var message = new ReplicationMessage { SendingServerName = storage.CounterStorageUrl };
385
386            using (var reader = storage.CreateReader())
387            {
388                message.Counters = reader.GetCountersSinceEtag(etag + 1).Take(10240).ToList(); //TODO: Capped this...how to get remaining values?
389                lastEtagSent = message.Counters.Count > 0 ? message.Counters.Max(x=>x.Etag):etag; // change this once changed this function do a reall paging
390            }
391
392	        return message;
393	    }
394
395		private RavenConnectionStringOptions GetConnectionOptionsSafe(CounterStorageReplicationDestination destination, out string lastError)
396		{
397			try
398			{
399				var connectionStringOptions = new RavenConnectionStringOptions
400				{
401                    Url = destination.ServerUrl,
402					ApiKey = destination.ApiKey,
403				};
404				if (string.IsNullOrEmpty(destination.Username) == false)
405				{
406					connectionStringOptions.Credentials = string.IsNullOrEmpty(destination.Domain)
407						? new NetworkCredential(destination.Username, destination.Password)
408						: new NetworkCredential(destination.Username, destination.Password, destination.Domain);
409				}
410				lastError = string.Empty;
411				return connectionStringOptions;
412			}
413			catch (Exception e)
414			{
415				lastError = e.Message;
416				Log.ErrorException(string.Format("Ignoring bad replication config!{0}Could not figure out connection options for [Url: {1}]",
417                    Environment.NewLine, destination.ServerUrl), e);
418				return null;
419			}
420		}
421
422        private bool IsFirstFailure(string destinationUrl)
423        {
424            var destStats = destinationsStats.GetOrAdd(destinationUrl, new CounterDestinationStats { Url = destinationUrl });
425            return destStats.FailureCount == 0;
426        }
427
428		//Notifies servers which send us counters that we are back online
429		private void NotifySiblings() //TODO: implement
430		{
431			var notifications = new BlockingCollection<RavenConnectionStringOptions>();
432
433			Task.Factory.StartNew(() => NotifySibling(notifications));
434
435			var replicationDestinations = GetReplicationDestinations();
436			foreach (var replicationDestination in replicationDestinations)
437			{
438				string lastError;
439				notifications.TryAdd(GetConnectionOptionsSafe(replicationDestination, out lastError), 15 * 1000);
440			}
441
442			//TODO: add to notifications to the source server, the servers we get the replications from
443		}
444
445		private void NotifySibling(BlockingCollection<RavenConnectionStringOptions> collection)
446		{
447			// using (LogContext.WithDatabase(docDb.Name)) todo:implement log context
448			while (true)
449			{
450				RavenConnectionStringOptions connectionStringOptions;
451				try
452				{
453					collection.TryTake(out connectionStringOptions, 15 * 1000, cancellation.Token);
454					if (connectionStringOptions == null)
455						return;
456				}
457				catch (Exception e)
458				{
459					Log.ErrorException("Could not get connection string options to notify sibling servers about restart", e);
460					return;
461				}
462				try
463				{
464					var url = connectionStringOptions.Url + "/counters/" + storage.Name + "/replication/heartbeat?from=" + Uri.EscapeDataString(storage.CounterStorageUrl);
465					var request = httpRavenRequestFactory.Create(url, "POST", connectionStringOptions);
466					request.WebRequest.ContentLength = 0;
467					request.ExecuteRequest();
468				}
469				catch (Exception e)
470				{
471					Log.WarnException("Could not notify " + connectionStringOptions.Url + " about sibling server being up & running", e);
472				}
473			}
474		}
475        
476        private void RecordSuccess(string url,
477            DateTime? lastSuccessTimestamp = null, 
478            long? lastReplicatedEtag = null,
479            DateTime? lastReplicatedLastModified = null,
480            DateTime? lastHeartbeatReceived = null, string lastError = null)
481        {
482            var stats = destinationsStats.GetOrAdd(url, new CounterDestinationStats { Url = url });
483            Interlocked.Exchange(ref stats.FailureCountInternal, 0);
484
485            if (lastSuccessTimestamp.HasValue)
486            {
487                stats.LastSuccessTimestamp = lastSuccessTimestamp.Value;
488            }
489
490            if (lastReplicatedEtag.HasValue)
491            {
492                stats.LastReplicatedEtag = lastReplicatedEtag.Value;
493            }
494
495            if (lastReplicatedLastModified.HasValue)
496                stats.LastSuccessTimestamp = stats.LastReplicatedLastModified = lastReplicatedLastModified;
497
498            if (lastHeartbeatReceived.HasValue)
499            {
500                stats.LastHeartbeatReceived = lastHeartbeatReceived;
501            }
502            else
503            {
504                stats.LastHeartbeatReceived = SystemTime.UtcNow;
505            }
506
507            if (!string.IsNullOrWhiteSpace(lastError))
508                stats.LastError = lastError;
509        }
510
511		private void RecordFailure(string url, string lastError)
512		{
513			var stats = destinationsStats.GetOrAdd(url, new CounterDestinationStats { Url = url });
514			Interlocked.Increment(ref stats.FailureCountInternal);
515			stats.LastFailureTimestamp = SystemTime.UtcNow;
516			if (string.IsNullOrWhiteSpace(lastError) == false)
517			{
518				stats.LastError = lastError;
519			}
520		}
521
522		private string HandleReplicationDistributionWebException(WebException e, string destinationUrl)
523		{
524			var response = e.Response as HttpWebResponse;
525			if (response != null)
526			{
527				Stream responseStream = response.GetResponseStream();
528				if (responseStream != null)
529				{
530					using (var streamReader = new StreamReader(responseStream))
531					{
532						var error = streamReader.ReadToEnd();
533						Log.WarnException("Replication to " + destinationUrl + " had failed\r\n" + error, e);
534					}
535				}
536				else
537				{
538					Log.WarnException("Replication to " + destinationUrl + " had failed", e);
539				}
540			}
541			else
542			{
543				Log.WarnException("Replication to " + destinationUrl + " had failed", e);
544			}
545
546			return e.Message;
547		}
548
549	    public int GetActiveTasksCount()
550	    {
551	        return activeTasks.Count;
552	    }
553
554        public ConcurrentDictionary<string, CounterDestinationStats> DestinationStats
555        {
556            get { return destinationsStats; }
557        }
558
559		public void Dispose()
560        {
561            Task task;
562            cancellation.Cancel();
563            SignalCounterUpdate();
564
565            while (activeTasks.TryDequeue(out task))
566            {
567                task.Wait();
568            }
569        }
570    }
571    
572}