PageRenderTime 106ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/src/java/org/apache/cassandra/utils/DiagnosticSnapshotService.java

https://github.com/beobal/cassandra
Java | 199 lines | 136 code | 22 blank | 41 comment | 7 complexity | ec9cda98b108a3ecc557d7733a35ed42 MD5 | raw file
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.cassandra.utils;
  19. import java.net.InetAddress;
  20. import java.time.LocalDate;
  21. import java.time.format.DateTimeFormatter;
  22. import java.util.concurrent.*;
  23. import java.util.concurrent.atomic.AtomicLong;
  24. import com.google.common.annotations.VisibleForTesting;
  25. import com.google.common.base.Preconditions;
  26. import org.slf4j.Logger;
  27. import org.slf4j.LoggerFactory;
  28. import org.apache.cassandra.concurrent.NamedThreadFactory;
  29. import org.apache.cassandra.db.*;
  30. import org.apache.cassandra.locator.InetAddressAndPort;
  31. import org.apache.cassandra.net.Message;
  32. import org.apache.cassandra.net.MessagingService;
  33. import org.apache.cassandra.net.Verb;
  34. import org.apache.cassandra.schema.TableId;
  35. import org.apache.cassandra.schema.TableMetadata;
  36. import org.hsqldb.Table;
  37. /**
  38. * Provides a means to take snapshots when triggered by anomalous events or when the breaking of invariants is
  39. * detected. When debugging certain classes of problems, having access to the relevant set of sstables when the problem
  40. * is detected (or as close to then as possible) can be invaluable.
  41. *
  42. * This class performs two functions; on a replica where an anomaly is detected, it provides methods to issue snapshot
  43. * requests to a provided set of replicas. For instance, if rows with duplicate clusterings are detected
  44. * (CASSANDRA-15789) during a read, a snapshot request will be issued to all participating replicas. If detected during
  45. * compaction, only the replica itself will receive the request. Requests are issued at a maximum rate of 1 per minute
  46. * for any given table. Any additional triggers for the same table during the 60 second window are dropped, regardless
  47. * of the replica set. This window is configurable via a system property (cassandra.diagnostic_snapshot_interval_nanos),
  48. * but this is intended for use in testing only and operators are not expected to override the default.
  49. *
  50. * The second function performed is to handle snapshot requests on replicas. Snapshot names are prefixed with strings
  51. * specific to the reason which triggered them. To manage consumption of disk space, replicas are restricted to taking
  52. * a single snapshot for each prefix in a single calendar day. So if duplicate rows are detected by multiple
  53. * coordinators during reads with the same replica set (or overlapping sets) on the same table, the coordinators may
  54. * each issue snapshot requests, but the replicas will only accept the first one they receive. Further requests will
  55. * be dropped on the replica side.
  56. */
  57. public class DiagnosticSnapshotService
  58. {
  59. private static final Logger logger = LoggerFactory.getLogger(DiagnosticSnapshotService.class);
  60. public static final DiagnosticSnapshotService instance =
  61. new DiagnosticSnapshotService(Executors.newSingleThreadExecutor(new NamedThreadFactory("DiagnosticSnapshot")));
  62. public static final String REPAIRED_DATA_MISMATCH_SNAPSHOT_PREFIX = "RepairedDataMismatch-";
  63. public static final String DUPLICATE_ROWS_DETECTED_SNAPSHOT_PREFIX = "DuplicateRows-";
  64. private final Executor executor;
  65. private DiagnosticSnapshotService(Executor executor)
  66. {
  67. this.executor = executor;
  68. }
  69. // Issue at most 1 snapshot request per minute for any given table.
  70. // Replicas will only create one snapshot per day, but this stops us
  71. // from swamping the network.
  72. // Overridable via system property for testing.
  73. private static final long SNAPSHOT_INTERVAL_NANOS = TimeUnit.MINUTES.toNanos(1);
  74. private static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.BASIC_ISO_DATE;
  75. private final ConcurrentHashMap<TableId, AtomicLong> lastSnapshotTimes = new ConcurrentHashMap<>();
  76. public static void duplicateRows(TableMetadata metadata, Iterable<InetAddressAndPort> replicas)
  77. {
  78. instance.maybeTriggerSnapshot(metadata, DUPLICATE_ROWS_DETECTED_SNAPSHOT_PREFIX, replicas);
  79. }
  80. public static void repairedDataMismatch(TableMetadata metadata, Iterable<InetAddressAndPort> replicas)
  81. {
  82. instance.maybeTriggerSnapshot(metadata, REPAIRED_DATA_MISMATCH_SNAPSHOT_PREFIX, replicas);
  83. }
  84. public static boolean isDiagnosticSnapshotRequest(SnapshotCommand command)
  85. {
  86. return command.snapshot_name.startsWith(REPAIRED_DATA_MISMATCH_SNAPSHOT_PREFIX)
  87. || command.snapshot_name.startsWith(DUPLICATE_ROWS_DETECTED_SNAPSHOT_PREFIX);
  88. }
  89. public static void snapshot(SnapshotCommand command, InetAddressAndPort initiator)
  90. {
  91. Preconditions.checkArgument(isDiagnosticSnapshotRequest(command));
  92. instance.maybeSnapshot(command, initiator);
  93. }
  94. public static String getSnapshotName(String prefix)
  95. {
  96. return String.format("%s%s", prefix, DATE_FORMAT.format(LocalDate.now()));
  97. }
  98. @VisibleForTesting
  99. public void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException
  100. {
  101. ExecutorUtils.shutdownNowAndWait(timeout, unit, executor);
  102. }
  103. private void maybeTriggerSnapshot(TableMetadata metadata, String prefix, Iterable<InetAddressAndPort> endpoints)
  104. {
  105. long now = System.nanoTime();
  106. AtomicLong cached = lastSnapshotTimes.computeIfAbsent(metadata.id, u -> new AtomicLong(0));
  107. long last = cached.get();
  108. long interval = Long.getLong("cassandra.diagnostic_snapshot_interval_nanos", SNAPSHOT_INTERVAL_NANOS);
  109. if (now - last > interval && cached.compareAndSet(last, now))
  110. {
  111. Message<SnapshotCommand> msg = Message.out(Verb.SNAPSHOT_REQ,
  112. new SnapshotCommand(metadata.keyspace,
  113. metadata.name,
  114. getSnapshotName(prefix),
  115. false));
  116. for (InetAddressAndPort replica : endpoints)
  117. MessagingService.instance().send(msg, replica);
  118. }
  119. else
  120. {
  121. logger.debug("Diagnostic snapshot request dropped due to throttling");
  122. }
  123. }
  124. private void maybeSnapshot(SnapshotCommand command, InetAddressAndPort initiator)
  125. {
  126. executor.execute(new DiagnosticSnapshotTask(command, initiator));
  127. }
  128. private static class DiagnosticSnapshotTask implements Runnable
  129. {
  130. final SnapshotCommand command;
  131. final InetAddressAndPort from;
  132. DiagnosticSnapshotTask(SnapshotCommand command, InetAddressAndPort from)
  133. {
  134. this.command = command;
  135. this.from = from;
  136. }
  137. public void run()
  138. {
  139. try
  140. {
  141. Keyspace ks = Keyspace.open(command.keyspace);
  142. if (ks == null)
  143. {
  144. logger.info("Snapshot request received from {} for {}.{} but keyspace not found",
  145. from,
  146. command.keyspace,
  147. command.column_family);
  148. return;
  149. }
  150. ColumnFamilyStore cfs = ks.getColumnFamilyStore(command.column_family);
  151. if (cfs.snapshotExists(command.snapshot_name))
  152. {
  153. logger.info("Received diagnostic snapshot request from {} for {}.{}, " +
  154. "but snapshot with tag {} already exists",
  155. from,
  156. command.keyspace,
  157. command.column_family,
  158. command.snapshot_name);
  159. return;
  160. }
  161. logger.info("Creating snapshot requested by {} of {}.{} tag: {}",
  162. from,
  163. command.keyspace,
  164. command.column_family,
  165. command.snapshot_name);
  166. cfs.snapshot(command.snapshot_name);
  167. }
  168. catch (IllegalArgumentException e)
  169. {
  170. logger.warn("Snapshot request received from {} for {}.{} but CFS not found",
  171. from,
  172. command.keyspace,
  173. command.column_family);
  174. }
  175. }
  176. }
  177. }