PageRenderTime 36ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java

https://gitlab.com/taichu/webmagic
Java | 128 lines | 105 code | 17 blank | 6 comment | 7 complexity | be0cf94a32fdee8e55f232be6bb0b2e6 MD5 | raw file
  1. package us.codecraft.webmagic.scheduler;
  2. import com.alibaba.fastjson.JSON;
  3. import org.apache.commons.codec.digest.DigestUtils;
  4. import redis.clients.jedis.Jedis;
  5. import redis.clients.jedis.JedisPool;
  6. import redis.clients.jedis.JedisPoolConfig;
  7. import us.codecraft.webmagic.Request;
  8. import us.codecraft.webmagic.Task;
  9. import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
  10. /**
  11. * Use Redis as url scheduler for distributed crawlers.<br>
  12. *
  13. * @author code4crafter@gmail.com <br>
  14. * @since 0.2.0
  15. */
  16. public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {
  17. private JedisPool pool;
  18. private static final String QUEUE_PREFIX = "queue_";
  19. private static final String SET_PREFIX = "set_";
  20. private static final String ITEM_PREFIX = "item_";
  21. public RedisScheduler(String host) {
  22. this(new JedisPool(new JedisPoolConfig(), host));
  23. }
  24. public RedisScheduler(JedisPool pool) {
  25. this.pool = pool;
  26. setDuplicateRemover(this);
  27. }
  28. @Override
  29. public void resetDuplicateCheck(Task task) {
  30. Jedis jedis = pool.getResource();
  31. try {
  32. jedis.del(getSetKey(task));
  33. } finally {
  34. pool.returnResource(jedis);
  35. }
  36. }
  37. @Override
  38. public boolean isDuplicate(Request request, Task task) {
  39. Jedis jedis = pool.getResource();
  40. try {
  41. boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
  42. if (!isDuplicate) {
  43. jedis.sadd(getSetKey(task), request.getUrl());
  44. }
  45. return isDuplicate;
  46. } finally {
  47. pool.returnResource(jedis);
  48. }
  49. }
  50. @Override
  51. protected void pushWhenNoDuplicate(Request request, Task task) {
  52. Jedis jedis = pool.getResource();
  53. try {
  54. jedis.rpush(getQueueKey(task), request.getUrl());
  55. if (request.getExtras() != null) {
  56. String field = DigestUtils.shaHex(request.getUrl());
  57. String value = JSON.toJSONString(request);
  58. jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
  59. }
  60. } finally {
  61. pool.returnResource(jedis);
  62. }
  63. }
  64. @Override
  65. public synchronized Request poll(Task task) {
  66. Jedis jedis = pool.getResource();
  67. try {
  68. String url = jedis.lpop(getQueueKey(task));
  69. if (url == null) {
  70. return null;
  71. }
  72. String key = ITEM_PREFIX + task.getUUID();
  73. String field = DigestUtils.shaHex(url);
  74. byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
  75. if (bytes != null) {
  76. Request o = JSON.parseObject(new String(bytes), Request.class);
  77. return o;
  78. }
  79. Request request = new Request(url);
  80. return request;
  81. } finally {
  82. pool.returnResource(jedis);
  83. }
  84. }
  85. protected String getSetKey(Task task) {
  86. return SET_PREFIX + task.getUUID();
  87. }
  88. protected String getQueueKey(Task task) {
  89. return QUEUE_PREFIX + task.getUUID();
  90. }
  91. @Override
  92. public int getLeftRequestsCount(Task task) {
  93. Jedis jedis = pool.getResource();
  94. try {
  95. Long size = jedis.llen(getQueueKey(task));
  96. return size.intValue();
  97. } finally {
  98. pool.returnResource(jedis);
  99. }
  100. }
  101. @Override
  102. public int getTotalRequestsCount(Task task) {
  103. Jedis jedis = pool.getResource();
  104. try {
  105. Long size = jedis.scard(getSetKey(task));
  106. return size.intValue();
  107. } finally {
  108. pool.returnResource(jedis);
  109. }
  110. }
  111. }