
/src/main/java/focusedCrawler/crawler/Crawler.java

https://gitlab.com/nyimbi/ache
/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package focusedCrawler.crawler;

import java.net.URL;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * <p>Description: This abstract class implements the partial behavior
 * of a web crawler.</p>
 *
 * <p>Copyright: Copyright (c) 2004</p>
 *
 * @author Luciano Barbosa
 * @version 1.0
 */
public abstract class Crawler extends Thread {

    // Possible states of the crawl cycle.
    public static final int INIT = 0;
    public static final int SELECT_URL = INIT + 1;
    public static final int CHECK_URL = SELECT_URL + 1;
    public static final int DOWNLOAD_URL = CHECK_URL + 1;
    public static final int PROCESS_DATA = DOWNLOAD_URL + 1;
    public static final int CHECK_DATA = PROCESS_DATA + 1;
    public static final int SEND_DATA = CHECK_DATA + 1;
    public static final int END = SEND_DATA + 1;
    public static final int SLEEPING = END + 1;

    public static final int[] STATES = new int[] {
        INIT, SELECT_URL, CHECK_URL, DOWNLOAD_URL,
        PROCESS_DATA, CHECK_DATA, SEND_DATA,
        END, SLEEPING
    };

    public static final int DEAD = SLEEPING + 1;
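
    // Note: DEAD is excluded from the STATES array, so partitionTime (sized
    // to STATES.length) has no slot for it; run() never records a timing
    // for the DEAD state.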
    private static final Logger logger = LoggerFactory.getLogger(Crawler.class);

    private int status;              // current state of the crawl cycle
    private boolean stop;            // when true, run() exits its main loop
    private long restingTime;        // pause between successful cycles, in ms
    private long sleepTime;          // back-off after an error, in ms
    private boolean jump;            // when true, abort the current cycle
    private long startCicleTime;     // timestamp at which the current cycle started
    private URL url;                 // URL currently being crawled
    private long totalCicleTime;     // duration of the last complete cycle
    private long[] partitionTime = new long[STATES.length]; // time spent in each state
    private String message;
    private int selectedLinks = 0;
    private boolean shutdown;
    private CrawlerException lastException; // last error raised by a cycle, if any
    public Crawler() {
        // Initialize defaults here as well, for consistency with the
        // named constructors.
        defaults();
    }

    public Crawler(String name) {
        super(name);
        defaults();
    }

    public Crawler(ThreadGroup g, String name) {
        super(g, name);
        defaults();
    }

    protected void defaults() {
        setShutdown(false);
        setStop(false);
        setRestingTime(0);
        setSleepTime(0);
        setUrl(null);
        setJump(false);
        setStartCicleTime(System.currentTimeMillis());
    }
    public void setStatus(int newStatus) {
        status = newStatus;
    }

    public int getStatus() {
        return status;
    }

    public void setStop(boolean newStop) {
        stop = newStop;
    }

    public boolean isStop() {
        return stop;
    }

    public void setRestingTime(long newRestingTime) {
        restingTime = newRestingTime;
    }

    public long getRestingTime() {
        return restingTime;
    }

    public void setSleepTime(long newSleepTime) {
        sleepTime = newSleepTime;
    }

    public long getSleepTime() {
        return sleepTime;
    }

    public void setJump(boolean newJump) {
        jump = newJump;
    }

    public boolean isJump() {
        return jump;
    }

    public void setJump(boolean newJump, String message) {
        logger.info(message);
        setJump(newJump);
    }

    public void setStartCicleTime(long newStartCicleTime) {
        startCicleTime = newStartCicleTime;
    }

    public long getStartCicleTime() {
        return startCicleTime;
    }

    public void setUrl(URL newUrl) {
        url = newUrl;
    }

    public URL getUrl() {
        return url;
    }

    public long getCicleTime() {
        return System.currentTimeMillis() - getStartCicleTime();
    }

    public void setTotalCicleTime(long newTotalCicleTime) {
        totalCicleTime = newTotalCicleTime;
    }

    public long getTotalCicleTime() {
        return totalCicleTime;
    }

    public void setPartitionTime(int index, long time) {
        partitionTime[index] = time;
    }

    public long getPartitionTime(int index) {
        return partitionTime[index];
    }

    public void setMessage(String newMessage) {
        message = newMessage;
    }

    public String getMessage() {
        return message;
    }
    /** Getter for property selectedLinks.
     * @return Value of property selectedLinks.
     */
    public int getSelectedLinks() {
        return selectedLinks;
    }

    /** Setter for property selectedLinks.
     * @param selectedLinks New value of property selectedLinks.
     */
    public void setSelectedLinks(int selectedLinks) {
        this.selectedLinks = selectedLinks;
    }

    public void setShutdown(boolean newShutdown) {
        shutdown = newShutdown;
    }

    public boolean isShutdown() {
        return shutdown;
    }

    public void setLastException(CrawlerException newLastException) {
        lastException = newLastException;
    }

    public CrawlerException getLastException() {
        return lastException;
    }
    /**
     * This method implements the main loop of the crawler, where the crawler
     * accomplishes all the steps needed to retrieve Web pages. The time spent
     * in each state is recorded in partitionTime, and a jump request at any
     * point aborts the current cycle.
     */
    public void run() {
        long time = System.currentTimeMillis();
        while (!stop) {
            setStartCicleTime(System.currentTimeMillis());
            try {
                setStatus(INIT);
                setPartitionTime(INIT, System.currentTimeMillis() - time);
                if (jump) {
                    setJump(false);
                    cleanup();
                    continue;
                }

                setStatus(SELECT_URL);
                time = System.currentTimeMillis();
                selectUrl();
                setPartitionTime(SELECT_URL, System.currentTimeMillis() - time);
                if (jump) {
                    setJump(false);
                    cleanup();
                    continue;
                }
                logger.trace(getName() + ">after request url");

                setStatus(CHECK_URL);
                time = System.currentTimeMillis();
                checkUrl();
                setPartitionTime(CHECK_URL, System.currentTimeMillis() - time);
                if (jump) {
                    setJump(false);
                    cleanup();
                    continue;
                }
                logger.trace(getName() + ">after check url");

                setStatus(DOWNLOAD_URL);
                time = System.currentTimeMillis();
                downloadUrl();
                setPartitionTime(DOWNLOAD_URL, System.currentTimeMillis() - time);
                if (jump) {
                    setJump(false);
                    cleanup();
                    continue;
                }
                logger.trace(getName() + ">after download data");

                setStatus(PROCESS_DATA);
                time = System.currentTimeMillis();
                processData();
                setPartitionTime(PROCESS_DATA, System.currentTimeMillis() - time);
                if (jump) {
                    setJump(false);
                    cleanup();
                    continue;
                }
                logger.trace(getName() + ">after process data");

                setStatus(CHECK_DATA);
                time = System.currentTimeMillis();
                checkData();
                setPartitionTime(CHECK_DATA, System.currentTimeMillis() - time);
                if (jump) {
                    setJump(false);
                    cleanup();
                    continue;
                }
                logger.trace(getName() + ">after check data");

                setStatus(SEND_DATA);
                time = System.currentTimeMillis();
                sendData();
                setPartitionTime(SEND_DATA, System.currentTimeMillis() - time);
                logger.trace(getName() + ">after send data");

                setLastException(null);
                setStatus(END);
                time = System.currentTimeMillis();
                restingSleep();
            }
            catch (CrawlerException re) {
                logger.error(re.getMessage(), re);
                if (re.detail != null) {
                    re.detail.printStackTrace();
                }
                setLastException(re);
                try {
                    setStatus(SLEEPING);
                    time = System.currentTimeMillis();
                    if (!stop) {
                        logger.info("Sleeping " + sleepTime + " ms due to last error.");
                        sleep(sleepTime);
                    }
                    setPartitionTime(SLEEPING, System.currentTimeMillis() - time);
                }
                catch (InterruptedException ie) {
                    logger.error("Sleeping interrupted.", ie);
                }
            }
            finally {
                try {
                    cleanup();
                }
                catch (Exception exc) {
                    logger.info("Problem while executing cleanup.", exc);
                }
                setPartitionTime(END, System.currentTimeMillis() - time);
                setTotalCicleTime(System.currentTimeMillis() - getStartCicleTime());
            }

            // Log the per-state timing breakdown for this cycle.
            StringBuilder parts = new StringBuilder();
            for (int i = 0; i < STATES.length; i++) {
                if (i > 0) {
                    parts.append(',');
                }
                parts.append(getPartitionTime(i));
            }
            logger.info("Total time is " + getTotalCicleTime() + " ms [" + parts + "]");
        }

        try {
            logger.info("Thread dead, calling cleanup().");
            setStatus(DEAD);
            cleanup();
            logger.info("Thread dead cleanup() done.");
        }
        catch (Exception exc) {
            logger.info("Problem while finishing crawler thread.", exc);
        }
    }
    public void restingSleep() {
        try {
            sleep(restingTime);
        }
        catch (InterruptedException exc) {
            logger.info("Sleeping interrupted.", exc);
        }
    }
    /**
     * This method gets the next URL to be processed.
     *
     * @throws CrawlerException
     */
    protected abstract void selectUrl() throws CrawlerException;

    /**
     * Checks whether any constraint applies to the given URL.
     *
     * @throws CrawlerException
     */
    protected abstract void checkUrl() throws CrawlerException;

    /**
     * This method downloads the given URL.
     *
     * @throws CrawlerException
     */
    protected abstract void downloadUrl() throws CrawlerException;

    /**
     * This method processes the URL content.
     *
     * @throws CrawlerException
     */
    protected abstract void processData() throws CrawlerException;

    /**
     * Checks whether any constraint applies to the processed data.
     *
     * @throws CrawlerException
     */
    protected abstract void checkData() throws CrawlerException;

    /**
     * This method sends the data already processed.
     *
     * @throws CrawlerException
     */
    protected abstract void sendData() throws CrawlerException;

    /**
     * This method cleans up any temporary attributes/variables.
     *
     * @throws CrawlerException
     */
    protected abstract void cleanup() throws CrawlerException;
}
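
A concrete crawler only has to implement the seven abstract hooks above; the state machine in run() drives them in order and handles timing, error back-off, and cleanup. The sketch below is purely illustrative and not part of this repository: the class name LoggingCrawler, its in-memory frontier, and the CrawlerException(String, Throwable) constructor (suggested by the detail field used in run()) are assumptions. It fetches each queued URL, records its size, and stops when the frontier is empty.

import java.net.URL;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

import focusedCrawler.crawler.Crawler;
import focusedCrawler.crawler.CrawlerException;

// Hypothetical subclass: fetches each queued URL and logs its size.
public class LoggingCrawler extends Crawler {

    private final Queue<URL> frontier = new ConcurrentLinkedQueue<>();
    private byte[] content;

    public LoggingCrawler(String name, Iterable<URL> seeds) {
        super(name);
        for (URL seed : seeds) {
            frontier.add(seed);
        }
    }

    @Override
    protected void selectUrl() throws CrawlerException {
        URL next = frontier.poll();
        if (next == null) {
            setStop(true); // empty frontier: end the run() loop
            setJump(true, "Frontier empty, stopping."); // abort this cycle too
            return;
        }
        setUrl(next);
    }

    @Override
    protected void checkUrl() throws CrawlerException {
        // No URL constraints in this sketch.
    }

    @Override
    protected void downloadUrl() throws CrawlerException {
        try (java.io.InputStream in = getUrl().openStream()) {
            content = in.readAllBytes();
        } catch (java.io.IOException e) {
            // Assumes a (String, Throwable) constructor on CrawlerException.
            throw new CrawlerException("Download failed: " + getUrl(), e);
        }
    }

    @Override
    protected void processData() throws CrawlerException {
        setMessage(getUrl() + " -> " + content.length + " bytes");
    }

    @Override
    protected void checkData() throws CrawlerException {
        // No data constraints in this sketch.
    }

    @Override
    protected void sendData() throws CrawlerException {
        System.out.println(getMessage());
    }

    @Override
    protected void cleanup() throws CrawlerException {
        content = null; // drop per-cycle state
    }
}

Since Crawler extends Thread, a driver could start such a subclass with new LoggingCrawler("crawler-1", seeds).start(), after calling setRestingTime(...) to set the pause between cycles and setSleepTime(...) to set the back-off applied after a CrawlerException.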