PageRenderTime 59ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/skkj_server-english-webmagic-0580f61df038/src/main/java/net/onemost/english/service/RepoPageProcessorService.java

https://bitbucket.org/tonyfeng1002/english-crawler
Java | 301 lines | 177 code | 43 blank | 81 comment | 15 complexity | 86b4046644d3a8901d7afd897b8f42bc MD5 | raw file
  1. package net.onemost.english.service;
  import java.io.IOException;
  import java.io.UnsupportedEncodingException;
  import java.net.URLEncoder;
  import java.util.ArrayList;
  import java.util.HashMap;
  import java.util.List;
  import java.util.Map;

  import com.alibaba.fastjson.JSON;
  import com.alibaba.fastjson.JSONObject;

  import net.onemost.english.dao.BookMapper;
  import net.onemost.english.dao.WordMapper;
  import net.onemost.english.dto.LoginInfo;
  import net.onemost.english.model.Book;
  import net.onemost.english.model.Word;
  import net.onemost.english.util.DBTools;
  import net.onemost.english.util.FileOpertionUtil;
  import net.onemost.english.util.GetCode;

  import us.codecraft.webmagic.Page;
  import us.codecraft.webmagic.Request;
  import us.codecraft.webmagic.Site;
  import us.codecraft.webmagic.Spider;
  import us.codecraft.webmagic.processor.PageProcessor;
  import us.codecraft.webmagic.utils.HttpConstant;
  23. public class RepoPageProcessorService implements PageProcessor {
  24. private static BookMapper bookMapper = DBTools.getSession().getMapper(BookMapper.class);
  25. private static WordMapper wordMapper = DBTools.getSession().getMapper(WordMapper.class);
  26. private static RepoPageProcessorService repoPageProcessorService;
  27. private Page page;
  28. private static final String POST_LOGIN_URL = "https://www.initwords.com/login/authless/ajaxLogin.do";
  29. private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(3000)
  30. .addHeader("Accept",
  31. "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
  32. .addHeader("Accept-Encoding", "gzip, deflate, br").addHeader("Accept-Language", "zh-CN,zh;q=0.9")
  33. .addHeader("Cache-Control", "no-cache")
  34. .addHeader("User-Agent",
  35. "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36")
  36. .addCookie("Hm_lvt_49a5957871e8051bc1a873596375812d", "1520235720,1520240018,1520298497,1520298867")
  37. .addCookie("Hm_lpvt_49a5957871e8051bc1a873596375812d", "1520298867")
  38. .addHeader("Referer", "https://www.initwords.com/studycenter/studyhome.jsp")
  39. .addHeader("Origin", "https://www.initwords.com");
  40. public RepoPageProcessorService(String jsessionId, String userCode, String tokenCode) {
  41. site.addHeader("EM-TokenCode", tokenCode).addHeader("EM-UserCode", userCode).addCookie("JSESSIONID",
  42. jsessionId);
  43. }
  44. public RepoPageProcessorService(String jsessionId) {
  45. site.addCookie("JSESSIONID", jsessionId);
  46. }
  47. @Override
  48. public void process(Page page) {
  49. this.page = page;
  50. }
  51. @Override
  52. public Site getSite() {
  53. return site;
  54. }
  55. public Page getPage() {
  56. return page;
  57. }
  58. public static void start(LoginInfo loginInfo) throws IOException {
  59. // 获取该用户订阅的所有课程
  60. repoPageProcessorService = new RepoPageProcessorService(loginInfo.getJsessionId(), loginInfo.getUserCode(),
  61. loginInfo.getTokenCode());
  62. Spider spider = Spider.create(repoPageProcessorService);
  63. visiteHomePage(spider, loginInfo.getUserCode());
  64. Page page = repoPageProcessorService.getPage();
  65. JSONObject jsonObject = JSON.parseObject(page.getJson().toString());
  66. List<JSONObject> list = JSON.parseArray(jsonObject.getJSONObject("data").getString("userbooks"),
  67. JSONObject.class);
  68. if (list.size() == 0) {
  69. throw new RuntimeException("获取课程失败");
  70. }
  71. List<String> bookModelCodes = new ArrayList<>();
  72. for (JSONObject node : list) {
  73. bookModelCodes.add(node.getString("moduleCode"));
  74. }
  75. // 开始抓取单词
  76. getLearnInfo(spider, bookModelCodes, loginInfo.getUserCode());
  77. }
  78. /*public static void start(String jsessionId, String username, String password) throws IOException {
  79. //获取登录信息
  80. LoginInfo loginInfo = login(jsessionId, username, password);
  81. // 获取该用户订阅的所有课程
  82. repoPageProcessorService = new RepoPageProcessorService(loginInfo.getJsessionId(), loginInfo.getUserCode(),
  83. loginInfo.getTokenCode());
  84. Spider spider = Spider.create(repoPageProcessorService);
  85. visiteHomePage(spider, loginInfo.getUserCode());
  86. Page page = repoPageProcessorService.getPage();
  87. JSONObject jsonObject = JSON.parseObject(page.getJson().toString());
  88. List<JSONObject> list = JSON.parseArray(jsonObject.getJSONObject("data").getString("userbooks"),
  89. JSONObject.class);
  90. if (list.size() == 0) {
  91. throw new RuntimeException("获取课程失败");
  92. }
  93. List<String> bookModelCodes = new ArrayList<>();
  94. for (JSONObject node : list) {
  95. bookModelCodes.add(node.getString("moduleCode"));
  96. }
  97. // 开始抓取单词
  98. getLearnInfo(spider, bookModelCodes, loginInfo.getUserCode());
  99. }*/
  100. protected static LoginInfo login(String jsessionId, String username, String password) throws IOException {
  101. //查询是否存在配置文件,存在则读取配置文件里面的信息
  102. if(FileOpertionUtil.getLoginFile()) {
  103. Map<String, String> map = FileOpertionUtil.getConfig();
  104. LoginInfo loginInfo = new LoginInfo();
  105. loginInfo.setJsessionId(map.get("session"));
  106. loginInfo.setTokenCode(map.get("tokenCode"));
  107. loginInfo.setUserCode(map.get("userCode"));
  108. return loginInfo;
  109. }else {
  110. repoPageProcessorService = new RepoPageProcessorService(jsessionId);
  111. Spider spider = Spider.create(repoPageProcessorService);
  112. String url = POST_LOGIN_URL + "?userId=" + username + "&userPwd=" + password
  113. + "&siteName=xfinit&loginType=studentLogin";
  114. Request request = new Request(url);
  115. Map<String, Object> variales = new HashMap<>();
  116. request.setMethod(HttpConstant.Method.POST);
  117. request.setExtras(variales);
  118. spider.addRequest(request);
  119. spider.run();
  120. Page page = repoPageProcessorService.getPage();
  121. LoginInfo loginInfo = GetCode.getInfo(page);
  122. FileOpertionUtil.createLoginFile(jsessionId, loginInfo.getUserCode(), loginInfo.getTokenCode());
  123. return loginInfo;
  124. }
  125. }
  126. /**
  127. * 获取课程信息
  128. *
  129. * @param spider
  130. */
  131. protected static void visiteHomePage(Spider spider, String userCode) {
  132. Request request = new Request(
  133. "https://www.initwords.com/book/getStudentBooks.do?studentCode=" + userCode + "&isNeedStudyPos=true");
  134. Map<String, Object> variales = new HashMap<>();
  135. request.setMethod(HttpConstant.Method.POST);
  136. request.setExtras(variales);
  137. spider.addRequest(request);
  138. spider.run();
  139. }
  140. /**
  141. * 获取模块信息
  142. *
  143. * @param spider
  144. */
  145. protected static void getLearnInfo(Spider spider, List<String> codes, String userCode) {
  146. for (String string : codes) {
  147. Book book = getBookInfo(string, userCode, spider);
  148. if(book == null) {
  149. continue;
  150. }
  151. Integer totalNum = book.getTotalunitnbr();
  152. for (int i = book.getStartNum(); i <= totalNum + book.getStartNum(); i++) {
  153. List<Word> list = getWordList(book.getBookCode(), userCode, spider, i, book.getId());
  154. }
  155. }
  156. }
  157. /**
  158. * 获取书本信息
  159. *
  160. * @param codes
  161. * @param userCode
  162. * @param spider
  163. * @return
  164. */
  165. protected static Book getBookInfo(String codes, String userCode, Spider spider) {
  166. String url = "https://www.initwords.com/unitstudy/ajaxUnitStudy.do?unitNbr=1&moduleCode=" + codes + "&userCode="
  167. + userCode;
  168. Request request = new Request(url);
  169. request.setMethod(HttpConstant.Method.POST);
  170. spider.addRequest(request);
  171. spider.run();
  172. Page page = repoPageProcessorService.getPage();
  173. JSONObject jsonObject = JSON.parseObject(page.getJson().toString()).getJSONObject("data")
  174. .getJSONObject("bookInfo");
  175. if (jsonObject == null) {
  176. return null;
  177. }
  178. Book book = new Book();
  179. book.setBookCode(jsonObject.getString("moduleCode"));
  180. book.setBookName(jsonObject.getString("bookName"));
  181. book.setIntroduce(jsonObject.getString("introduce"));
  182. book.setTotalunitnbr(jsonObject.getInteger("totalUnitNbr"));
  183. book.setModelName(jsonObject.getString("moduleName"));
  184. book.setPublisher(jsonObject.getString("publisher"));
  185. book.setBookGroupName(jsonObject.getString("bookGroupName"));
  186. book.setGroupName(jsonObject.getString("groupName"));
  187. book.setStartNum(jsonObject.getInteger("startFrom"));
  188. List<Book> bookList = bookMapper.selectAll();
  189. /*System.out.println(book.getBookName());
  190. if(!bookList.isEmpty()) {
  191. for(Book bk:bookList) {
  192. System.out.println(bk.getBookName());
  193. if(!bk.getBookName().equals(book.getBookName())) {
  194. bookMapper.insert(book);
  195. DBTools.getSession().commit();
  196. return book;
  197. }
  198. }
  199. }*/
  200. // return null;
  201. // bookMapper.insert(book);
  202. List<String> courseNameList = new ArrayList<>();
  203. for(Book bk:bookList) {
  204. courseNameList.add(bk.getBookName());
  205. }
  206. if(!courseNameList.contains(book.getBookName())) {
  207. bookMapper.insert(book);
  208. DBTools.getSession().commit();
  209. return book;
  210. }
  211. return null;
  212. // bookMapper.insert(book);
  213. // DBTools.getSession().commit();
  214. // return book;
  215. }
  216. /**
  217. * 获取单词列表
  218. *
  219. * @param bookCode
  220. * @param userCode
  221. * @param spider
  222. * @param current
  223. * @return
  224. */
  225. protected static List<Word> getWordList(String bookCode, String userCode, Spider spider, Integer current, Integer bookId) {
  226. String url = "https://www.initwords.com/unitstudy/ajaxUnitStudy.do?unitNbr=" + current + "&moduleCode="
  227. + bookCode + "&userCode=" + userCode;
  228. Request request = new Request(url);
  229. request.setMethod(HttpConstant.Method.POST);
  230. spider.addRequest(request);
  231. spider.run();
  232. Page page = repoPageProcessorService.getPage();
  233. JSONObject jsonObject = JSON.parseObject(page.getJson().toString()).getJSONObject("data");
  234. List<Word> words = JSON.parseArray(jsonObject.getString("vocList"), Word.class);
  235. if(words == null) {
  236. return null;
  237. }
  238. for (Word word : words) {
  239. // word.setBookId(bookId);
  240. // List<Word> wordList = wordMapper.selectAll();
  241. // for(Word word2:wordList) {
  242. // if(word2.getBookId()!=bookId && !word2.getSpelling().equals(word.getSpelling())) {
  243. // wordMapper.insert(word);
  244. // }
  245. // }
  246. word.setBookId(bookId);
  247. wordMapper.insert(word);
  248. }
  249. DBTools.getSession().commit();
  250. return words;
  251. }
  252. }