PageRenderTime 93ms CodeModel.GetById 31ms RepoModel.GetById 1ms app.codeStats 0ms

/src/java/org/apache/cassandra/db/HintedHandOffManager.java

https://github.com/cchandler/cassandra
Java | 300 lines | 212 code | 30 blank | 58 comment | 23 complexity | 1639cb835b799553002129177a803086 MD5 | raw file
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one
  3. * or more contributor license agreements. See the NOTICE file
  4. * distributed with this work for additional information
  5. * regarding copyright ownership. The ASF licenses this file
  6. * to you under the Apache License, Version 2.0 (the
  7. * "License"); you may not use this file except in compliance
  8. * with the License. You may obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. package org.apache.cassandra.db;
  19. import java.io.IOException;
  20. import java.net.InetAddress;
  21. import java.net.UnknownHostException;
  22. import java.util.Collection;
  23. import java.util.concurrent.ExecutorService;
  24. import java.util.concurrent.TimeoutException;
  25. import org.apache.cassandra.db.IClock;
  26. import org.apache.commons.lang.ArrayUtils;
  27. import org.slf4j.Logger;
  28. import org.slf4j.LoggerFactory;
  29. import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
  30. import org.apache.cassandra.db.filter.QueryFilter;
  31. import org.apache.cassandra.db.filter.QueryPath;
  32. import org.apache.cassandra.gms.FailureDetector;
  33. import org.apache.cassandra.gms.Gossiper;
  34. import org.apache.cassandra.net.Message;
  35. import org.apache.cassandra.net.MessagingService;
  36. import org.apache.cassandra.service.DigestMismatchException;
  37. import org.apache.cassandra.service.StorageService;
  38. import org.apache.cassandra.service.WriteResponseHandler;
  39. import org.apache.cassandra.thrift.InvalidRequestException;
  40. import org.apache.cassandra.utils.WrappedRunnable;
  41. import org.cliffc.high_scale_lib.NonBlockingHashSet;
  42. import static com.google.common.base.Charsets.UTF_8;
  43. /**
  44. * For each endpoint for which we have hints, there is a row in the system hints CF.
  45. * SuperColumns in that row are keys for which we have hinted data.
  46. * Subcolumns names within that supercolumn are keyspace+CF, concatenated with SEPARATOR.
  47. * Subcolumn values are always empty; instead, we store the row data "normally"
  48. * in the application table it belongs in.
  49. *
  50. * When FailureDetector signals that a node that was down is back up, we read its
  51. * hints row to see what rows we need to forward data for, then reach each row in its
  52. * entirety and send it over.
  53. * (TODO handle rows that have incrementally grown too large for a single message.)
  54. *
  55. * deliverHints is also exposed to JMX so it can be run manually if FD ever misses
  56. * its cue somehow.
  57. *
  58. * HHM never deletes the row from Application tables; there is no way to distinguish that
  59. * from hinted tombstones! instead, rely on cleanup compactions to remove data
  60. * that doesn't belong on this node. (Cleanup compactions may be started manually
  61. * -- on a per node basis -- with "nodeprobe cleanup.")
  62. *
  63. * TODO this avoids our hint rows from growing excessively large by offloading the
  64. * message data into application tables. But, this means that cleanup compactions
  65. * will nuke HH data. Probably better would be to store the RowMutation messages
  66. * in a HHData (non-super) CF, modifying the above to store a UUID value in the
  67. * HH subcolumn value, which we use as a key to a [standard] HHData system CF
  68. * that would contain the message bytes.
  69. */
  70. public class HintedHandOffManager
  71. {
  72. public static final HintedHandOffManager instance = new HintedHandOffManager();
  73. private static final Logger logger_ = LoggerFactory.getLogger(HintedHandOffManager.class);
  74. public static final String HINTS_CF = "HintsColumnFamily";
  75. private static final int PAGE_SIZE = 10000;
  76. private static final String SEPARATOR = "-";
  77. private final NonBlockingHashSet<InetAddress> queuedDeliveries = new NonBlockingHashSet<InetAddress>();
  78. private final ExecutorService executor_;
  79. public HintedHandOffManager()
  80. {
  81. int hhPriority = System.getProperty("cassandra.compaction.priority") == null
  82. ? Thread.NORM_PRIORITY
  83. : Integer.parseInt(System.getProperty("cassandra.compaction.priority"));
  84. executor_ = new JMXEnabledThreadPoolExecutor("HINTED-HANDOFF-POOL", hhPriority);
  85. }
  86. private static boolean sendMessage(InetAddress endpoint, String tableName, String cfName, byte[] key) throws IOException
  87. {
  88. if (!Gossiper.instance.isKnownEndpoint(endpoint))
  89. {
  90. logger_.warn("Hints found for endpoint " + endpoint + " which is not part of the gossip network. discarding.");
  91. return true;
  92. }
  93. if (!FailureDetector.instance.isAlive(endpoint))
  94. {
  95. return false;
  96. }
  97. Table table = Table.open(tableName);
  98. RowMutation rm = new RowMutation(tableName, key);
  99. DecoratedKey dkey = StorageService.getPartitioner().decorateKey(key);
  100. ColumnFamilyStore cfs = table.getColumnFamilyStore(cfName);
  101. ColumnFamily cf = cfs.getColumnFamily(QueryFilter.getIdentityFilter(dkey, new QueryPath(cfs.getColumnFamilyName())));
  102. if (cf != null)
  103. rm.add(cf);
  104. Message message = rm.makeRowMutationMessage();
  105. WriteResponseHandler responseHandler = new WriteResponseHandler(endpoint);
  106. MessagingService.instance.sendRR(message, new InetAddress[] { endpoint }, responseHandler);
  107. try
  108. {
  109. responseHandler.get();
  110. }
  111. catch (TimeoutException e)
  112. {
  113. return false;
  114. }
  115. return true;
  116. }
  117. private static void deleteHintKey(byte[] endpointAddress, byte[] key, byte[] tableCF, IClock clock) throws IOException
  118. {
  119. RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, endpointAddress);
  120. rm.delete(new QueryPath(HINTS_CF, key, tableCF), clock);
  121. rm.apply();
  122. }
  123. public static void deleteHintsForEndPoint(InetAddress endpoint)
  124. {
  125. ColumnFamilyStore hintStore = Table.open(Table.SYSTEM_TABLE).getColumnFamilyStore(HINTS_CF);
  126. RowMutation rm = new RowMutation(Table.SYSTEM_TABLE, endpoint.getAddress());
  127. rm.delete(new QueryPath(HINTS_CF), new TimestampClock(System.currentTimeMillis()));
  128. try {
  129. logger_.info("Deleting any stored hints for " + endpoint);
  130. rm.apply();
  131. hintStore.forceFlush();
  132. CompactionManager.instance.submitMajor(hintStore, 0, Integer.MAX_VALUE).get();
  133. }
  134. catch (Exception e)
  135. {
  136. logger_.warn("Could not delete hints for " + endpoint + ": " + e);
  137. }
  138. }
  139. private static boolean pagingFinished(ColumnFamily hintColumnFamily, byte[] startColumn)
  140. {
  141. // done if no hints found or the start column (same as last column processed in previous iteration) is the only one
  142. return hintColumnFamily == null
  143. || (hintColumnFamily.getSortedColumns().size() == 1 && hintColumnFamily.getColumn(startColumn) != null);
  144. }
  145. public static byte[] makeCombinedName(String tableName, String columnFamily)
  146. {
  147. byte[] withsep = ArrayUtils.addAll(tableName.getBytes(UTF_8), SEPARATOR.getBytes());
  148. return ArrayUtils.addAll(withsep, columnFamily.getBytes(UTF_8));
  149. }
  150. private static String[] getTableAndCFNames(byte[] joined)
  151. {
  152. int index;
  153. index = ArrayUtils.lastIndexOf(joined, SEPARATOR.getBytes()[0]);
  154. if (index < 1)
  155. throw new RuntimeException("Corrupted hint name " + joined.toString());
  156. String[] parts = new String[2];
  157. parts[0] = new String(ArrayUtils.subarray(joined, 0, index));
  158. parts[1] = new String(ArrayUtils.subarray(joined, index+1, joined.length));
  159. return parts;
  160. }
  161. private void deliverHintsToEndpoint(InetAddress endpoint) throws IOException, DigestMismatchException, InvalidRequestException, TimeoutException
  162. {
  163. logger_.info("Started hinted handoff for endpoint " + endpoint);
  164. queuedDeliveries.remove(endpoint);
  165. // 1. Get the key of the endpoint we need to handoff
  166. // 2. For each column read the list of rows: subcolumns are KS + SEPARATOR + CF
  167. // 3. Delete the subcolumn if the write was successful
  168. // 4. Force a flush
  169. // 5. Do major compaction to clean up all deletes etc.
  170. DecoratedKey epkey = StorageService.getPartitioner().decorateKey(endpoint.getAddress());
  171. int rowsReplayed = 0;
  172. ColumnFamilyStore hintStore = Table.open(Table.SYSTEM_TABLE).getColumnFamilyStore(HINTS_CF);
  173. byte[] startColumn = ArrayUtils.EMPTY_BYTE_ARRAY;
  174. delivery:
  175. while (true)
  176. {
  177. QueryFilter filter = QueryFilter.getSliceFilter(epkey, new QueryPath(HINTS_CF), startColumn, ArrayUtils.EMPTY_BYTE_ARRAY, null, false, PAGE_SIZE);
  178. ColumnFamily hintColumnFamily = ColumnFamilyStore.removeDeleted(hintStore.getColumnFamily(filter), Integer.MAX_VALUE);
  179. if (pagingFinished(hintColumnFamily, startColumn))
  180. break;
  181. Collection<IColumn> keyColumns = hintColumnFamily.getSortedColumns();
  182. for (IColumn keyColumn : keyColumns)
  183. {
  184. startColumn = keyColumn.name();
  185. Collection<IColumn> tableCFs = keyColumn.getSubColumns();
  186. for (IColumn tableCF : tableCFs)
  187. {
  188. String[] parts = getTableAndCFNames(tableCF.name());
  189. if (sendMessage(endpoint, parts[0], parts[1], keyColumn.name()))
  190. {
  191. deleteHintKey(endpoint.getAddress(), keyColumn.name(), tableCF.name(), tableCF.clock());
  192. rowsReplayed++;
  193. }
  194. else
  195. {
  196. logger_.info("Could not complete hinted handoff to " + endpoint);
  197. break delivery;
  198. }
  199. startColumn = keyColumn.name();
  200. }
  201. }
  202. }
  203. if (rowsReplayed > 0)
  204. {
  205. hintStore.forceFlush();
  206. try
  207. {
  208. CompactionManager.instance.submitMajor(hintStore, 0, Integer.MAX_VALUE).get();
  209. }
  210. catch (Exception e)
  211. {
  212. throw new RuntimeException(e);
  213. }
  214. }
  215. logger_.info(String.format("Finished hinted handoff of %s rows to endpoint %s",
  216. rowsReplayed, endpoint));
  217. }
  218. /** called when a keyspace is dropped or rename. newTable==null in the case of a drop. */
  219. public static void renameHints(String oldTable, String newTable) throws IOException
  220. {
  221. DecoratedKey oldTableKey = StorageService.getPartitioner().decorateKey(oldTable.getBytes(UTF_8));
  222. // we're basically going to fetch, drop and add the scf for the old and new table. we need to do it piecemeal
  223. // though since there could be GB of data.
  224. ColumnFamilyStore hintStore = Table.open(Table.SYSTEM_TABLE).getColumnFamilyStore(HINTS_CF);
  225. byte[] startCol = ArrayUtils.EMPTY_BYTE_ARRAY;
  226. long now = System.currentTimeMillis();
  227. while (true)
  228. {
  229. QueryFilter filter = QueryFilter.getSliceFilter(oldTableKey, new QueryPath(HINTS_CF), startCol, ArrayUtils.EMPTY_BYTE_ARRAY, null, false, PAGE_SIZE);
  230. ColumnFamily cf = ColumnFamilyStore.removeDeleted(hintStore.getColumnFamily(filter), Integer.MAX_VALUE);
  231. if (pagingFinished(cf, startCol))
  232. break;
  233. if (newTable != null)
  234. {
  235. RowMutation insert = new RowMutation(Table.SYSTEM_TABLE, newTable.getBytes(UTF_8));
  236. insert.add(cf);
  237. insert.apply();
  238. }
  239. RowMutation drop = new RowMutation(Table.SYSTEM_TABLE, oldTableKey.key);
  240. for (byte[] key : cf.getColumnNames())
  241. {
  242. drop.delete(new QueryPath(HINTS_CF, key), new TimestampClock(now));
  243. startCol = key;
  244. }
  245. drop.apply();
  246. }
  247. }
  248. /*
  249. * This method is used to deliver hints to a particular endpoint.
  250. * When we learn that some endpoint is back up we deliver the data
  251. * to him via an event driven mechanism.
  252. */
  253. public void deliverHints(final InetAddress to)
  254. {
  255. if (!queuedDeliveries.add(to))
  256. return;
  257. Runnable r = new WrappedRunnable()
  258. {
  259. public void runMayThrow() throws Exception
  260. {
  261. deliverHintsToEndpoint(to);
  262. }
  263. };
  264. executor_.submit(r);
  265. }
  266. public void deliverHints(String to) throws UnknownHostException
  267. {
  268. deliverHints(InetAddress.getByName(to));
  269. }
  270. }