PageRenderTime 51ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 0ms

/Raven.Database/Counters/Controllers/RavenCounterReplication.cs

https://github.com/nwendel/ravendb
C# | 572 lines | 490 code | 72 blank | 10 comment | 55 complexity | 8718234692797c4db0eddf7a72642f8c MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, BSD-3-Clause, CC-BY-SA-3.0
  1. using System;
  2. using System.Collections.Concurrent;
  3. using System.Collections.Generic;
  4. using System.Diagnostics;
  5. using System.IO;
  6. using System.Linq;
  7. using System.Net;
  8. using System.Threading;
  9. using System.Threading.Tasks;
  10. using Raven.Abstractions;
  11. using Raven.Abstractions.Connection;
  12. using Raven.Abstractions.Counters;
  13. using Raven.Abstractions.Data;
  14. using Raven.Abstractions.Logging;
  15. using Raven.Json.Linq;
  16. namespace Raven.Database.Counters.Controllers
  17. {
  18. public class RavenCounterReplication:IDisposable
  19. {
  private static readonly ILog Log = LogManager.GetCurrentClassLogger();

  // Monitor object: pulsed by SignalCounterUpdate, waited on by WaitForCountersUpdate.
  private readonly object waitForCounterUpdate = new object();

  // Incremented on every SignalCounterUpdate call.
  private int actualWorkCounter;

  // The actualWorkCounter value last observed by WaitForCountersUpdate.
  private int replicatedWorkCounter;

  // Set by Pause(), cleared by Continue(); read at the start of every replication pass.
  private bool shouldPause;

  private bool IsRunning { get; set; }

  // Per-destination statistics, looked up case-insensitively by URL.
  private readonly ConcurrentDictionary<string, CounterDestinationStats> destinationsStats =
      new ConcurrentDictionary<string, CounterDestinationStats>(StringComparer.OrdinalIgnoreCase);

  // Incremented once per replication pass that has at least one destination.
  private int replicationAttempts;

  // One semaphore per destination URL; held while a replication task to that destination runs.
  private readonly ConcurrentDictionary<string, SemaphoreSlim> activeReplicationTasks =
      new ConcurrentDictionary<string, SemaphoreSlim>();

  // Replication tasks that have been started; completed ones are purged from the head.
  private readonly ConcurrentQueue<Task> activeTasks = new ConcurrentQueue<Task>();

  // Created in StartReplication.
  private HttpRavenRequestFactory httpRavenRequestFactory;

  private readonly CounterStorage storage;
  private readonly CancellationTokenSource cancellation;
  /// <summary>
  /// Outcome of a single replication attempt against one destination.
  /// </summary>
  enum ReplicationResult
  {
      /// <summary>Data was sent to the destination.</summary>
      Success,

      /// <summary>The attempt failed.</summary>
      Failure,

      /// <summary>There was nothing to send.</summary>
      NotReplicated
  }
  /// <summary>
  /// Creates the replication component for <paramref name="storage"/> and subscribes
  /// <see cref="SignalCounterUpdate"/> to the storage's CounterUpdated event.
  /// </summary>
  /// <param name="storage">The counter storage whose changes are replicated.</param>
  public RavenCounterReplication(CounterStorage storage)
  {
      cancellation = new CancellationTokenSource();
      this.storage = storage;
      storage.CounterUpdated += SignalCounterUpdate;
  }
  /// <summary>
  /// Records that there is new work to replicate and wakes every thread waiting
  /// in <see cref="WaitForCountersUpdate"/>.
  /// </summary>
  public void SignalCounterUpdate()
  {
      lock (waitForCounterUpdate)
      {
          // Bump the counter before pulsing so a woken waiter observes the change.
          Interlocked.Increment(ref actualWorkCounter);

          Monitor.PulseAll(waitForCounterUpdate);
      }
  }
  /// <summary>
  /// Creates the HTTP request factory (using the storage's replication timeout) and
  /// starts the replication loop on a long-running task.
  /// </summary>
  public void StartReplication()
  {
      httpRavenRequestFactory = new HttpRavenRequestFactory
      {
          RequestTimeoutInMs = storage.ReplicationTimeoutInMs
      };

      new Task(ReplicationAction, TaskCreationOptions.LongRunning).Start();
  }
  /// <summary>
  /// Replication loop: replicate to all servers, then wait for a counter update or a
  /// timeout, until cancellation is requested.
  /// </summary>
  private void ReplicationAction()
  {
      var idleWait = TimeSpan.FromMinutes(5);
      var busyWait = TimeSpan.FromSeconds(30);

      var triggeredByDataModification = false;
      var nextWait = idleWait;

      //NotifySiblings(); //TODO: implement
      while (!cancellation.IsCancellationRequested)
      {
          SendReplicationToAllServers(triggeredByDataModification);

          triggeredByDataModification = WaitForCountersUpdate(nextWait);

          // After a data-driven wake-up wait only 30 seconds next time; otherwise 5 minutes.
          nextWait = triggeredByDataModification ? busyWait : idleWait;
      }

      IsRunning = false;
  }
  /// <summary>
  /// Returns immediately with <c>true</c> when the work counter has moved since the last
  /// check; otherwise blocks on the monitor until it is pulsed or <paramref name="timeout"/>
  /// elapses.
  /// </summary>
  /// <param name="timeout">Maximum time to wait for a pulse.</param>
  /// <returns>
  /// <c>true</c> if new work was detected or the monitor was pulsed before the timeout;
  /// otherwise <c>false</c>.
  /// </returns>
  private bool WaitForCountersUpdate(TimeSpan timeout)
  {
      // Cheap check without taking the lock.
      if (TryAcknowledgePendingWork())
          return true;

      lock (waitForCounterUpdate)
      {
          // Re-check under the lock: a signal may have arrived in between.
          if (TryAcknowledgePendingWork())
              return true;

          Log.Debug("No counter updates for counter storage {0} was found, will wait for updates", storage.CounterStorageUrl);
          return Monitor.Wait(waitForCounterUpdate, timeout);
      }
  }

  // Syncs replicatedWorkCounter with actualWorkCounter; returns true when they differed.
  private bool TryAcknowledgePendingWork()
  {
      if (Thread.VolatileRead(ref actualWorkCounter) == replicatedWorkCounter)
          return false;

      replicatedWorkCounter = actualWorkCounter;
      return true;
  }
  /// <summary>
  /// Requests that subsequent replication passes be skipped until <see cref="Continue"/> is called.
  /// </summary>
  public void Pause()
  {
      this.shouldPause = true;
  }
  /// <summary>
  /// Clears the pause flag set by <see cref="Pause"/>.
  /// </summary>
  public void Continue()
  {
      this.shouldPause = false;
  }
  /// <summary>
  /// Handles a heartbeat from <paramref name="src"/> by resetting its failure state.
  /// </summary>
  /// <param name="src">URL identifying the server that sent the heartbeat.</param>
  public void HandleHeartbeat(string src)
  {
      this.ResetFailureForHeartbeat(src);
  }
  /// <summary>
  /// Records a success for <paramref name="src"/> with the current time as the heartbeat
  /// timestamp, then signals the replication loop.
  /// </summary>
  private void ResetFailureForHeartbeat(string src)
  {
      var heartbeatTime = SystemTime.UtcNow;
      RecordSuccess(src, lastHeartbeatReceived: heartbeatTime);

      SignalCounterUpdate();
  }
  /// <summary>
  /// Runs one replication pass over all configured destinations, unless paused.
  /// Any exception is logged and swallowed so the replication loop keeps running.
  /// </summary>
  /// <param name="runningBecauseOfDataModifications">
  /// When <c>true</c>, destinations that <see cref="IsNotFailing"/> rejects are skipped.
  /// </param>
  private void SendReplicationToAllServers(bool runningBecauseOfDataModifications)
  {
      IsRunning = !shouldPause;
      if (!IsRunning)
          return;

      try
      {
          var destinations = GetReplicationDestinations();
          if (destinations == null || destinations.Count <= 0)
              return;

          var attemptNumber = Interlocked.Increment(ref replicationAttempts);

          foreach (var destination in destinations)
          {
              // The failure check is evaluated before the Disabled flag, as in the original filter.
              var eligible =
                  (!runningBecauseOfDataModifications || IsNotFailing(destination.CounterStorageUrl, attemptNumber))
                  && !destination.Disabled;

              if (eligible)
                  ReplicateToDestination(destination);
          }
      }
      catch (Exception e)
      {
          Log.ErrorException("Failed to perform replication", e);
      }
  }
  /// <summary>
  /// Starts a background replication task for <paramref name="destination"/>, unless the
  /// destination's semaphore cannot be acquired immediately.
  /// </summary>
  private void ReplicateToDestination(CounterStorageReplicationDestination destination)
  {
      var dest = destination.CounterStorageUrl;

      // Non-blocking acquire: if it fails, skip this destination for now.
      var gate = activeReplicationTasks.GetOrAdd(dest, key => new SemaphoreSlim(1));
      if (!gate.Wait(0))
          return;

      var replicationTask = Task.Factory.StartNew(
          () =>
          {
              //TODO: log with counter storage context (LogContext.WithDatabase(storage.Name))
              try
              {
                  if (ReplicateTo(destination))
                      SignalCounterUpdate();
              }
              catch (Exception e)
              {
                  Log.ErrorException("Could not replicate to " + dest, e);
              }
          });

      activeTasks.Enqueue(replicationTask);

      replicationTask.ContinueWith(
          finished =>
          {
              // Purge every finished task sitting at the head of the queue.
              Task head;
              while (activeTasks.TryPeek(out head)
                     && (head.IsCompleted || head.IsCanceled || head.IsFaulted))
              {
                  activeTasks.TryDequeue(out head); // removes the head of the queue
              }
          });
  }
  /// <summary>
  /// Performs one replication attempt to <paramref name="destination"/>, records the outcome
  /// in the destination statistics and metrics, and releases the destination's semaphore.
  /// </summary>
  /// <returns><c>true</c> only when the attempt ended with <see cref="ReplicationResult.Success"/>.</returns>
  private bool ReplicateTo(CounterStorageReplicationDestination destination)
  {
      var timer = Stopwatch.StartNew();
      //TODO: here, build url according to: destination.Url + '/counters/' + destination.
      try
      {
          long lastEtag;
          string lastError;
          var outcome = TryReplicate(destination, out lastEtag, out lastError);

          if (outcome == ReplicationResult.Success)
          {
              DateTime replicationTime = SystemTime.UtcNow;
              RecordSuccess(destination.CounterStorageUrl, lastReplicatedEtag: lastEtag, lastReplicatedLastModified: replicationTime);
              storage.MetricsCounters.OutgoingReplications.Mark();
              return true;
          }

          if (outcome == ReplicationResult.NotReplicated)
          {
              //TODO: Record not replicated
              RecordSuccess(destination.CounterStorageUrl, SystemTime.UtcNow);
              return false;
          }

          RecordFailure(destination.CounterStorageUrl, lastError);
          storage.MetricsCounters.OutgoingReplications.Mark();
          return false;
      }
      catch (Exception ex)
      {
          Log.ErrorException("Error occured replicating to: " + destination.CounterStorageUrl, ex);
          RecordFailure(destination.CounterStorageUrl, ex.Message);
          return false;
      }
      finally
      {
          timer.Stop();
          var elapsedMs = (long)timer.Elapsed.TotalMilliseconds;

          storage.MetricsCounters.GetReplicationDurationHistogram(destination.CounterStorageUrl).Update(elapsedMs);
          storage.MetricsCounters.GetReplicationDurationMetric(destination.CounterStorageUrl).Mark(elapsedMs);

          // Release the semaphore for this destination so a later attempt can acquire it.
          activeReplicationTasks
              .GetOrAdd(destination.CounterStorageUrl, key => new SemaphoreSlim(0, 1))
              .Release();
      }
  }
  /// <summary>
  /// Resolves connection options, fetches the last etag the destination has from us,
  /// collects newer counters and sends them.
  /// </summary>
  /// <param name="destination">Destination to replicate to.</param>
  /// <param name="lastEtagSent">Highest etag in the batch, or 0 when no batch was built.</param>
  /// <param name="lastError">Error text from the last failing step, if any.</param>
  /// <returns>
  /// <see cref="ReplicationResult.NotReplicated"/> when there are no counters to send,
  /// <see cref="ReplicationResult.Success"/> when the batch was sent, otherwise
  /// <see cref="ReplicationResult.Failure"/>.
  /// </returns>
  private ReplicationResult TryReplicate(CounterStorageReplicationDestination destination, out long lastEtagSent, out string lastError)
  {
      lastEtagSent = 0;

      var options = GetConnectionOptionsSafe(destination, out lastError);
      if (options == null)
          return ReplicationResult.Failure;

      long etag;
      if (!GetLastReplicatedEtagFrom(options, destination.CounterStorageUrl, out etag, out lastError))
          return ReplicationResult.Failure;

      var batch = GetCountersDataSinceEtag(etag, out lastEtagSent);

      storage.MetricsCounters.GetReplicationBatchSizeMetric(destination.CounterStorageUrl).Mark(batch.Counters.Count);
      storage.MetricsCounters.GetReplicationBatchSizeHistogram(destination.CounterStorageUrl).Update(batch.Counters.Count);

      if (batch.Counters.Count <= 0)
          return ReplicationResult.NotReplicated;

      var sent = PerformReplicationToServer(options, destination.CounterStorageUrl, etag, batch, out lastError);
      return sent ? ReplicationResult.Success : ReplicationResult.Failure;
  }
  /// <summary>
  /// Asks the destination for the last etag it received from this server, retrying once
  /// when the destination has no recorded failures yet.
  /// </summary>
  /// <param name="connectionStringOptions">Connection options for the destination server.</param>
  /// <param name="counterStorageUrl">URL of the destination counter storage.</param>
  /// <param name="lastEtag">The etag reported by the destination; 0 on failure.</param>
  /// <param name="lastError">Error text of the last failed attempt, or empty on success.</param>
  /// <returns><c>true</c> when the etag was retrieved.</returns>
  private bool GetLastReplicatedEtagFrom(RavenConnectionStringOptions connectionStringOptions, string counterStorageUrl, out long lastEtag, out string lastError)
  {
      if (TryGetLastReplicatedEtagFrom(connectionStringOptions, counterStorageUrl, out lastEtag, out lastError))
          return true;

      // Failure statistics are recorded under the counter storage URL (see ReplicateTo),
      // so the "first failure" lookup must use the same key. Using the server URL here
      // would always report zero failures and add a stray entry to the statistics.
      if (IsFirstFailure(counterStorageUrl))
          return TryGetLastReplicatedEtagFrom(connectionStringOptions, counterStorageUrl, out lastEtag, out lastError);

      return false;
  }
  /// <summary>
  /// Issues a single GET to <c>{counterStorageUrl}/lastEtag</c> and parses the response
  /// body as the last etag the destination has from this server.
  /// </summary>
  /// <param name="connectionStringOptions">Connection options for the destination server.</param>
  /// <param name="counterStorageUrl">URL of the destination counter storage.</param>
  /// <param name="lastEtag">The parsed etag; 0 on failure.</param>
  /// <param name="lastError">Empty on success, otherwise the failure message.</param>
  /// <returns><c>true</c> when the request succeeded and the body was parsed.</returns>
  private bool TryGetLastReplicatedEtagFrom(RavenConnectionStringOptions connectionStringOptions, string counterStorageUrl, out long lastEtag, out string lastError)
  {
      lastEtag = 0;
      try
      {
          long etag = 0;

          // This server's URL is a query-string value, so it must be escaped
          // (it contains ':' and '/', and may contain '&' or '#').
          var url = string.Format("{0}/lastEtag?serverUrl={1}", counterStorageUrl, Uri.EscapeDataString(storage.CounterStorageUrl));

          var request = httpRavenRequestFactory.Create(url, "GET", connectionStringOptions);

          // The etag is machine-readable data: parse it independently of the thread culture.
          request.ExecuteRequest(etagString => etag = long.Parse(etagString.ReadToEnd(), System.Globalization.CultureInfo.InvariantCulture));

          lastEtag = etag;
          lastError = string.Empty;
          return true;
      }
      catch (WebException e)
      {
          lastError = HandleReplicationDistributionWebException(e, counterStorageUrl);
          return false;
      }
      catch (Exception e)
      {
          lastError = e.Message;
          return false;
      }
  }
  /// <summary>
  /// Sends <paramref name="message"/> to the destination, retrying once when the
  /// destination has no recorded failures yet.
  /// </summary>
  /// <param name="connectionStringOptions">Connection options for the destination server.</param>
  /// <param name="counterStorageUrl">URL of the destination counter storage.</param>
  /// <param name="etag">Not used by this method; kept for caller compatibility.</param>
  /// <param name="message">The replication batch to send.</param>
  /// <param name="lastError">Error text of the last failed attempt, or empty on success.</param>
  /// <returns><c>true</c> when the batch was sent.</returns>
  private bool PerformReplicationToServer(RavenConnectionStringOptions connectionStringOptions, string counterStorageUrl, long etag, ReplicationMessage message, out string lastError)
  {
      if (TryPerformReplicationToServer(connectionStringOptions, counterStorageUrl, message, out lastError))
          return true;

      // Failure statistics are recorded under the counter storage URL (see ReplicateTo),
      // so the "first failure" lookup must use the same key. Using the server URL here
      // would always report zero failures and add a stray entry to the statistics.
      if (IsFirstFailure(counterStorageUrl))
          return TryPerformReplicationToServer(connectionStringOptions, counterStorageUrl, message, out lastError);

      return false;
  }
  /// <summary>
  /// Issues a single POST of <paramref name="message"/> to <c>{counterStorageUrl}/replication</c>.
  /// </summary>
  /// <param name="lastError">Empty on success, otherwise the failure message.</param>
  /// <returns><c>true</c> when the request completed without throwing.</returns>
  private bool TryPerformReplicationToServer(RavenConnectionStringOptions connectionStringOptions, string counterStorageUrl, ReplicationMessage message, out string lastError)
  {
      lastError = string.Empty;
      try
      {
          var replicationUrl = string.Format("{0}/replication", counterStorageUrl);

          var request = httpRavenRequestFactory.Create(replicationUrl, "POST", connectionStringOptions);
          request.Write(RavenJObject.FromObject(message));
          request.ExecuteRequest();

          return true;
      }
      catch (WebException webException)
      {
          lastError = HandleReplicationDistributionWebException(webException, counterStorageUrl);
          return false;
      }
      catch (Exception exception)
      {
          Log.ErrorException("Error occured replicating to: " + counterStorageUrl, exception);
          lastError = exception.Message;
          return false;
      }
  }
  /// <summary>
  /// Reads the replication document from storage and returns its destinations.
  /// </summary>
  /// <returns>The destinations, or <c>null</c> when there is no replication document.</returns>
  private List<CounterStorageReplicationDestination> GetReplicationDestinations()
  {
      CounterStorageReplicationDocument document;
      using (var reader = storage.CreateReader())
      {
          document = reader.GetReplicationData();
      }

      if (document == null)
          return null;

      return document.Destinations;
  }
  /// <summary>
  /// Decides whether a destination should be tried on this replication attempt,
  /// backing off as its failure count grows.
  /// </summary>
  /// <param name="destServerName">Key of the destination in the statistics dictionary.</param>
  /// <param name="currentReplicationAttempts">Sequence number of the current replication pass.</param>
  /// <returns>
  /// <c>true</c> when the destination has no statistics or at most 10 failures. Otherwise
  /// <c>true</c> only on every 2nd attempt (more than 10 failures), every 5th attempt
  /// (more than 100) or every 10th attempt (more than 1000).
  /// </returns>
  private bool IsNotFailing(string destServerName, int currentReplicationAttempts)
  {
      CounterDestinationStats destinationStats;
      if (!destinationsStats.TryGetValue(destServerName, out destinationStats))
          return true;

      // Read once: the count can change concurrently between reads.
      var failureCount = destinationStats.FailureCount;
      if (failureCount <= 10)
          return true;

      // The tiers must be mutually exclusive. Independent ifs let the lowest tier
      // overwrite the others, so every failing destination was retried every 2nd attempt.
      int attemptInterval;
      if (failureCount > 1000)
          attemptInterval = 10;
      else if (failureCount > 100)
          attemptInterval = 5;
      else
          attemptInterval = 2;

      var shouldReplicateTo = currentReplicationAttempts % attemptInterval == 0;

      Log.Debug("Failure count for {0} is {1}, skipping replication: {2}",
          destServerName, failureCount, shouldReplicateTo == false);

      return shouldReplicateTo;
  }
  /// <summary>
  /// Builds a replication message containing up to 10240 counters read from storage
  /// starting at <paramref name="etag"/> + 1.
  /// </summary>
  /// <param name="etag">Last etag the destination already has.</param>
  /// <param name="lastEtagSent">
  /// Highest etag in the batch, or <paramref name="etag"/> when the batch is empty.
  /// </param>
  private ReplicationMessage GetCountersDataSinceEtag(long etag, out long lastEtagSent)
  {
      var message = new ReplicationMessage { SendingServerName = storage.CounterStorageUrl };

      using (var reader = storage.CreateReader())
      {
          //TODO: the batch is capped - how to get the remaining values?
          message.Counters = reader.GetCountersSinceEtag(etag + 1).Take(10240).ToList();

          // TODO: change this once the function does real paging.
          if (message.Counters.Count > 0)
              lastEtagSent = message.Counters.Max(counter => counter.Etag);
          else
              lastEtagSent = etag;
      }

      return message;
  }
  /// <summary>
  /// Builds connection options (URL, API key and optional credentials) for
  /// <paramref name="destination"/> without letting exceptions escape.
  /// </summary>
  /// <param name="lastError">Empty on success, otherwise the exception message.</param>
  /// <returns>The connection options, or <c>null</c> when building them failed.</returns>
  private RavenConnectionStringOptions GetConnectionOptionsSafe(CounterStorageReplicationDestination destination, out string lastError)
  {
      try
      {
          var options = new RavenConnectionStringOptions
          {
              Url = destination.ServerUrl,
              ApiKey = destination.ApiKey,
          };

          // Credentials are only set when a user name is configured.
          if (!string.IsNullOrEmpty(destination.Username))
          {
              if (string.IsNullOrEmpty(destination.Domain))
                  options.Credentials = new NetworkCredential(destination.Username, destination.Password);
              else
                  options.Credentials = new NetworkCredential(destination.Username, destination.Password, destination.Domain);
          }

          lastError = string.Empty;
          return options;
      }
      catch (Exception e)
      {
          lastError = e.Message;
          Log.ErrorException(string.Format("Ignoring bad replication config!{0}Could not figure out connection options for [Url: {1}]",
              Environment.NewLine, destination.ServerUrl), e);
          return null;
      }
  }
  /// <summary>
  /// Returns <c>true</c> when the statistics entry for <paramref name="destinationUrl"/>
  /// (created if missing) has a failure count of zero.
  /// </summary>
  private bool IsFirstFailure(string destinationUrl)
  {
      return destinationsStats
          .GetOrAdd(destinationUrl, new CounterDestinationStats { Url = destinationUrl })
          .FailureCount == 0;
  }
  /// <summary>
  /// Notifies servers which send us counters that we are back online.
  /// Connection options for each replication destination are queued and consumed by
  /// <see cref="NotifySibling"/> on a background task.
  /// </summary>
  private void NotifySiblings() //TODO: implement
  {
      var replicationDestinations = GetReplicationDestinations();

      // GetReplicationDestinations returns null when no replication document exists.
      if (replicationDestinations == null || replicationDestinations.Count == 0)
          return;

      var notifications = new BlockingCollection<RavenConnectionStringOptions>();

      Task.Factory.StartNew(() => NotifySibling(notifications));

      foreach (var replicationDestination in replicationDestinations)
      {
          string lastError;
          var connectionStringOptions = GetConnectionOptionsSafe(replicationDestination, out lastError);

          // A null item makes the consumer stop, so a bad destination must be skipped
          // rather than queued (GetConnectionOptionsSafe has already logged the error).
          if (connectionStringOptions == null)
              continue;

          notifications.TryAdd(connectionStringOptions, 15 * 1000);
      }

      //TODO: add to notifications to the source server, the servers we get the replications from
  }
  /// <summary>
  /// Consumes connection options from <paramref name="collection"/> and POSTs a heartbeat to
  /// each one. Stops when no item arrives within 15 seconds, when taking an item throws,
  /// or when a null item is taken.
  /// </summary>
  private void NotifySibling(BlockingCollection<RavenConnectionStringOptions> collection)
  {
      // TODO: implement log context (LogContext.WithDatabase)
      for (;;)
      {
          RavenConnectionStringOptions options;
          try
          {
              collection.TryTake(out options, 15 * 1000, cancellation.Token);
          }
          catch (Exception e)
          {
              Log.ErrorException("Could not get connection string options to notify sibling servers about restart", e);
              return;
          }

          // Nothing was taken (timeout) or a null item was queued: stop.
          if (options == null)
              return;

          try
          {
              var heartbeatUrl = options.Url + "/counters/" + storage.Name + "/replication/heartbeat?from=" + Uri.EscapeDataString(storage.CounterStorageUrl);

              var request = httpRavenRequestFactory.Create(heartbeatUrl, "POST", options);
              request.WebRequest.ContentLength = 0;
              request.ExecuteRequest();
          }
          catch (Exception e)
          {
              Log.WarnException("Could not notify " + options.Url + " about sibling server being up & running", e);
          }
      }
  }
  /// <summary>
  /// Resets the failure count for <paramref name="url"/> (creating its statistics entry if
  /// needed) and updates whichever of the optional values were supplied.
  /// </summary>
  /// <param name="url">Key of the destination in the statistics dictionary.</param>
  /// <param name="lastSuccessTimestamp">When given, stored as the last success time.</param>
  /// <param name="lastReplicatedEtag">When given, stored as the last replicated etag.</param>
  /// <param name="lastReplicatedLastModified">
  /// When given, stored as both the last replicated modification time and the last success time.
  /// </param>
  /// <param name="lastHeartbeatReceived">
  /// Stored as the last heartbeat time; when omitted the current UTC time is stored instead.
  /// </param>
  /// <param name="lastError">When not blank, stored as the last error.</param>
  private void RecordSuccess(string url,
      DateTime? lastSuccessTimestamp = null,
      long? lastReplicatedEtag = null,
      DateTime? lastReplicatedLastModified = null,
      DateTime? lastHeartbeatReceived = null, string lastError = null)
  {
      var destinationStats = destinationsStats.GetOrAdd(url, new CounterDestinationStats { Url = url });

      Interlocked.Exchange(ref destinationStats.FailureCountInternal, 0);

      if (lastSuccessTimestamp.HasValue)
          destinationStats.LastSuccessTimestamp = lastSuccessTimestamp.Value;

      if (lastReplicatedEtag.HasValue)
          destinationStats.LastReplicatedEtag = lastReplicatedEtag.Value;

      // Applied after lastSuccessTimestamp, so this value wins when both are supplied.
      if (lastReplicatedLastModified.HasValue)
      {
          destinationStats.LastReplicatedLastModified = lastReplicatedLastModified;
          destinationStats.LastSuccessTimestamp = lastReplicatedLastModified;
      }

      destinationStats.LastHeartbeatReceived = lastHeartbeatReceived ?? SystemTime.UtcNow;

      if (string.IsNullOrWhiteSpace(lastError) == false)
          destinationStats.LastError = lastError;
  }
  /// <summary>
  /// Increments the failure count for <paramref name="url"/> (creating its statistics entry
  /// if needed), stamps the failure time and, when not blank, stores the error text.
  /// </summary>
  private void RecordFailure(string url, string lastError)
  {
      var destinationStats = destinationsStats.GetOrAdd(url, new CounterDestinationStats { Url = url });

      Interlocked.Increment(ref destinationStats.FailureCountInternal);
      destinationStats.LastFailureTimestamp = SystemTime.UtcNow;

      if (!string.IsNullOrWhiteSpace(lastError))
          destinationStats.LastError = lastError;
  }
  /// <summary>
  /// Logs a warning for a failed replication request, appending the HTTP response body
  /// when one is available.
  /// </summary>
  /// <param name="e">The exception raised by the request.</param>
  /// <param name="destinationUrl">URL used in the log message.</param>
  /// <returns>The exception's message.</returns>
  private string HandleReplicationDistributionWebException(WebException e, string destinationUrl)
  {
      var logMessage = "Replication to " + destinationUrl + " had failed";

      var httpResponse = e.Response as HttpWebResponse;
      Stream body = httpResponse != null ? httpResponse.GetResponseStream() : null;

      if (body == null)
      {
          Log.WarnException(logMessage, e);
          return e.Message;
      }

      using (var bodyReader = new StreamReader(body))
      {
          var error = bodyReader.ReadToEnd();
          Log.WarnException(logMessage + "\r\n" + error, e);
      }

      return e.Message;
  }
  /// <summary>
  /// Returns the number of replication tasks currently held in the active-tasks queue.
  /// </summary>
  public int GetActiveTasksCount()
  {
      var queuedTasks = activeTasks.Count;
      return queuedTasks;
  }
  /// <summary>
  /// The live per-destination statistics dictionary (not a copy).
  /// </summary>
  public ConcurrentDictionary<string, CounterDestinationStats> DestinationStats
  {
      get
      {
          return this.destinationsStats;
      }
  }
  /// <summary>
  /// Stops replication: requests cancellation, detaches from the storage's CounterUpdated
  /// event, wakes the replication loop so it can observe the cancellation, and waits for
  /// every queued replication task to finish.
  /// </summary>
  public void Dispose()
  {
      cancellation.Cancel();

      // The constructor subscribed this handler. Leaving it attached keeps this instance
      // reachable from the storage and lets it keep receiving updates after disposal.
      storage.CounterUpdated -= SignalCounterUpdate;

      // Pulse the monitor so a loop blocked in WaitForCountersUpdate wakes up now
      // instead of waiting for its timeout.
      SignalCounterUpdate();

      Task task;
      while (activeTasks.TryDequeue(out task))
      {
          task.Wait();
      }
  }
  499. }
  500. }