/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
Java | 128 lines | 105 code | 17 blank | 6 comment | 7 complexity | be0cf94a32fdee8e55f232be6bb0b2e6 MD5 | raw file
1package us.codecraft.webmagic.scheduler;
2
3import com.alibaba.fastjson.JSON;
4import org.apache.commons.codec.digest.DigestUtils;
5import redis.clients.jedis.Jedis;
6import redis.clients.jedis.JedisPool;
7import redis.clients.jedis.JedisPoolConfig;
8import us.codecraft.webmagic.Request;
9import us.codecraft.webmagic.Task;
10import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
11
12/**
13 * Use Redis as url scheduler for distributed crawlers.<br>
14 *
15 * @author code4crafter@gmail.com <br>
16 * @since 0.2.0
17 */
18public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {
19
20 private JedisPool pool;
21
22 private static final String QUEUE_PREFIX = "queue_";
23
24 private static final String SET_PREFIX = "set_";
25
26 private static final String ITEM_PREFIX = "item_";
27
28 public RedisScheduler(String host) {
29 this(new JedisPool(new JedisPoolConfig(), host));
30 }
31
32 public RedisScheduler(JedisPool pool) {
33 this.pool = pool;
34 setDuplicateRemover(this);
35 }
36
37 @Override
38 public void resetDuplicateCheck(Task task) {
39 Jedis jedis = pool.getResource();
40 try {
41 jedis.del(getSetKey(task));
42 } finally {
43 pool.returnResource(jedis);
44 }
45 }
46
47 @Override
48 public boolean isDuplicate(Request request, Task task) {
49 Jedis jedis = pool.getResource();
50 try {
51 boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
52 if (!isDuplicate) {
53 jedis.sadd(getSetKey(task), request.getUrl());
54 }
55 return isDuplicate;
56 } finally {
57 pool.returnResource(jedis);
58 }
59
60 }
61
62 @Override
63 protected void pushWhenNoDuplicate(Request request, Task task) {
64 Jedis jedis = pool.getResource();
65 try {
66 jedis.rpush(getQueueKey(task), request.getUrl());
67 if (request.getExtras() != null) {
68 String field = DigestUtils.shaHex(request.getUrl());
69 String value = JSON.toJSONString(request);
70 jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
71 }
72 } finally {
73 pool.returnResource(jedis);
74 }
75 }
76
77 @Override
78 public synchronized Request poll(Task task) {
79 Jedis jedis = pool.getResource();
80 try {
81 String url = jedis.lpop(getQueueKey(task));
82 if (url == null) {
83 return null;
84 }
85 String key = ITEM_PREFIX + task.getUUID();
86 String field = DigestUtils.shaHex(url);
87 byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
88 if (bytes != null) {
89 Request o = JSON.parseObject(new String(bytes), Request.class);
90 return o;
91 }
92 Request request = new Request(url);
93 return request;
94 } finally {
95 pool.returnResource(jedis);
96 }
97 }
98
99 protected String getSetKey(Task task) {
100 return SET_PREFIX + task.getUUID();
101 }
102
103 protected String getQueueKey(Task task) {
104 return QUEUE_PREFIX + task.getUUID();
105 }
106
107 @Override
108 public int getLeftRequestsCount(Task task) {
109 Jedis jedis = pool.getResource();
110 try {
111 Long size = jedis.llen(getQueueKey(task));
112 return size.intValue();
113 } finally {
114 pool.returnResource(jedis);
115 }
116 }
117
118 @Override
119 public int getTotalRequestsCount(Task task) {
120 Jedis jedis = pool.getResource();
121 try {
122 Long size = jedis.scard(getSetKey(task));
123 return size.intValue();
124 } finally {
125 pool.returnResource(jedis);
126 }
127 }
128}