PageRenderTime 171ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/code/GPAssist/src/main/java/org/tomstools/html/fetcher/AgencyFetcherApp.java

https://github.com/lotomer/tomstools
Java | 212 lines | 162 code | 17 blank | 33 comment | 38 complexity | b5fd8e90039b23e95504efba27d2593a MD5 | raw file
  1. /**
  2. * copyright (a) 2010-2012 tomstools.org. All rights reserved.
  3. */
  4. package org.tomstools.html.fetcher;
  5. import java.net.MalformedURLException;
  6. import java.security.NoSuchAlgorithmException;
  7. import java.util.regex.Matcher;
  8. import java.util.regex.Pattern;
  9. import org.tomstools.common.log.Logger;
  10. import org.tomstools.common.util.MD5;
  11. import org.tomstools.common.util.Utils;
  12. import org.tomstools.html.Util.HTMLUtil;
  13. import org.tomstools.html.data.Agency;
  14. import org.tomstools.html.data.AgencyDAO;
  15. import org.tomstools.html.data.AgencyDeal;
  16. import org.tomstools.html.data.AgencyDealDAO;
  17. import org.tomstools.html.data.StockDeal;
  18. import org.tomstools.html.data.StockDealDAO;
  19. import com.alibaba.fastjson.JSONArray;
  20. import com.alibaba.fastjson.JSONObject;
  21. /**
  22. * @author lotomer
  23. * @date 2012-6-11
  24. * @time 下午02:09:32
  25. */
  26. public class AgencyFetcherApp {
  27. private static final Logger LOG =
  28. Logger.getLogger(AgencyFetcherApp.class);
  29. private AgencyDAO agency;
  30. private HTMLFetcher fetcher;
  31. private AgencyDealDAO agencyDeal;
  32. private MD5 md5;
  33. public AgencyFetcherApp() throws NoSuchAlgorithmException {
  34. md5 = new MD5();
  35. fetcher = new HTMLFetcher();
  36. }
  37. public AgencyFetcherApp(String proxyHost, int proxyPort, String proxyScheme) throws NoSuchAlgorithmException {
  38. if (Utils.isEmpty(proxyHost)) {
  39. fetcher = new HTMLFetcher();
  40. } else {
  41. fetcher = new HTMLFetcher(proxyHost, proxyPort, proxyScheme);
  42. }
  43. md5 = new MD5();
  44. }
  45. /**
  46. * 抓取页面
  47. */
  48. public void fetchAgency(String beginDate, String endDate) {
  49. // 第一步 获取每日股票龙虎榜数据
  50. // http://quotes.money.163.com/hs/marketdata/service/lhb.php?host=/hs/marketdata/service/lhb.php&page=0&query=start:2013-09-06;end:2013-09-06&fields=NO,SYMBOL,SNAME,TDATE,TCLOSE,PCHG,SMEBTSTOCK1,SYMBOL,VOTURNOVER,COMPAREA,VATURNOVER,SYMBOL&sort=TDATE&order=desc&count=150&type=query&initData=[object%20Object]&req=11511
  51. String urlStep1 = "http://quotes.money.163.com/hs/marketdata/service/lhb.php?host=/hs/marketdata/service/lhb.php&page=0&"
  52. + "query=start:%s;end:%s&fields=NO,SYMBOL,TDATE,SNAME,SMEBTSTOCK1&sort=TDATE&order=desc&count=250000&type=query&req=11511";
  53. // 第二步 逐个获取龙虎榜股票涉及的主力
  54. // http://quotes.money.163.com/hs/marketdata/mrlhbSub.php?clear=1202&symbol=000663&type=01&date=2013-09-06&width=920&height=500&modal=true&frame=true
  55. String urlStep2 = "http://quotes.money.163.com/hs/marketdata/mrlhbSub.php?clear=1202&"
  56. + "symbol=%s&type=%s&date=%s&width=920&height=500&modal=true";
  57. // 第三步 查询主力的交易明细
  58. // http://quotes.money.163.com/hs/marketdata/service/jglhb.php?host=/hs/marketdata/service/jglhb.php&page=0&query=agencysymbol:80138252;date:8&fields=NO,SYMBOL,SNAME,TDATE,SMEBTCOMPANY4,SMEBTCOMPANY5,SMEBTCOMPANY1&sort=TDATE&order=desc&count=25&type=query&req=11526
  59. // String urlStep3 =
  60. // "http://quotes.money.163.com/hs/marketdata/service/jglhb.php?host=/hs/marketdata/service/jglhb.php&page=0&query=agencysymbol:80138252;date:8&fields=NO,SYMBOL,SNAME,TDATE,SMEBTCOMPANY4,SMEBTCOMPANY5,SMEBTCOMPANY1&sort=TDATE&order=desc&count=250&type=query&req=11526";
  61. String htmlContent = fetcher.fetchHTMLContent(String.format(urlStep1, beginDate, endDate));
  62. //System.out.println(htmlContent);
  63. if (Utils.isEmpty(htmlContent)) {
  64. return;
  65. }
  66. JSONObject obj = (JSONObject) JSONObject.parse(htmlContent);
  67. LOG.info("total:" + obj.get("total"));
  68. JSONArray arr = obj.getJSONArray("list");
  69. LOG.info("fetch:"+arr.size());
  70. String host = HTMLUtil.getHost(urlStep2);
  71. LOG.info("host:"+host);
  72. agency = new AgencyDAO();
  73. agencyDeal = new AgencyDealDAO();
  74. StockDealDAO stockDeal = new StockDealDAO();
  75. for (int i = 0; i < arr.size(); ++i) {
  76. JSONObject o = (JSONObject) arr.get(i);
  77. htmlContent = fetcher.fetchHTMLContent(String.format(urlStep2, o.get("SYMBOL"),o.get("SMEBTSTOCK11"),
  78. o.get("TDATE")));
  79. parseSubUrls(htmlContent,host,o.get("SYMBOL").toString(),o.get("SNAME").toString(),o.get("TDATE").toString());
  80. stockDeal.add(new StockDeal(o.get("SYMBOL").toString(), o.get("SNAME").toString(), o.get("TDATE").toString(), o.get("TCLOSE").toString(), o.get("PCHG").toString()));
  81. }
  82. agency.save();
  83. // 先删除指定日期的交易数据,然后再添加
  84. stockDeal.clean(beginDate,endDate);
  85. stockDeal.save();
  86. // 先删除指定日期的交易数据,然后再添加
  87. agencyDeal.clean(beginDate,endDate);
  88. agencyDeal.save();
  89. }
  90. Pattern pattern = Pattern.compile("异动期内买入金额最大的前5名</td></tr>(.*?)</tbody>", Pattern.CASE_INSENSITIVE
  91. | Pattern.UNICODE_CASE | Pattern.DOTALL);
  92. Pattern valuePattern = Pattern.compile("<tr>\\s*?<[/]{0,1}td>(.*?)</td>\\s*?<td>(.*?)</td>\\s*?<td>(.*?)</td>\\s*?<td>(.*?)</td>", Pattern.CASE_INSENSITIVE
  93. | Pattern.UNICODE_CASE | Pattern.DOTALL);
  94. Pattern namePattern = Pattern.compile("<a .*?href='(/marketdata/agencylist_(.*?).html )'.*?>(.*?)</a>", Pattern.CASE_INSENSITIVE
  95. | Pattern.UNICODE_CASE | Pattern.DOTALL);
  96. private void parseSubUrls(String htmlContent,String host,String symbol,String sname,String tdate) {
  97. Matcher matcher = pattern.matcher(htmlContent);
  98. if (matcher.find()) {
  99. // 获取正文
  100. Matcher valueMatcher = valuePattern.matcher(matcher.group(1));
  101. while (valueMatcher.find()) {
  102. // 循环获取值
  103. Matcher nameMatcher = namePattern.matcher(valueMatcher.group(1));
  104. if (nameMatcher.find()){
  105. // 添加机构信息
  106. agency.add(new Agency(nameMatcher.group(2),nameMatcher.group(3),host +nameMatcher.group(1)));
  107. //添加交易数据
  108. agencyDeal.add(new AgencyDeal(nameMatcher.group(2), symbol, sname, tdate, valueMatcher.group(2).trim(), valueMatcher.group(4).trim()));
  109. }else{
  110. // 添加机构信息
  111. String name = HTMLUtil.removeTags(valueMatcher.group(1));
  112. agency.add(new Agency(md5.md5(name),name,""));
  113. //添加交易数据
  114. agencyDeal.add(new AgencyDeal(md5.md5(name), symbol, sname, tdate, valueMatcher.group(2).trim(), valueMatcher.group(4).trim()));
  115. //System.out.println(HTMLUtil.removeTags(valueMatcher.group(1))+":"+valueMatcher.group(2).trim()+":"+valueMatcher.group(4).trim());
  116. }
  117. }
  118. }
  119. }
  120. private static void printHelp() {
  121. System.out.println("Usage: AgencyDealDataFetcherApp [options] agencySymbol");
  122. System.out.println("Options are:");
  123. System.out.println(" -b beginDate yyyy-MM-dd. Like 2013-09-01.");
  124. System.out.println(" -e endDate yyyy-MM-dd. Like 2013-09-01.");
  125. System.out.println(" -host proxyHost The proxy host.");
  126. System.out.println(" -port proxyPort The proxy port. Default is 8087");
  127. System.out.println(" -scheme proxyScheme The proxy scheme. Default is http.");
  128. System.out.println(" -h help Print the help message.");
  129. }
  130. public static void main(String[] args) throws MalformedURLException, NoSuchAlgorithmException {
  131. String beginDate = null;// "2013-09-06";
  132. String endDate = null;// "2013-09-06";
  133. String proxyHost = null;// "127.0.0.1";
  134. int proxyPort = 8087;
  135. String proxyScheme = "http";
  136. for (int i = 0; i < args.length; ++i) {
  137. if ("-b".equals(args[i])) {
  138. if (++i == args.length) {
  139. System.err.println("-b beginDate yyyy-MM-dd. Like 2013-09-01");
  140. System.exit(-1);
  141. } else {
  142. beginDate = args[i];
  143. }
  144. } else if ("-e".equals(args[i])) {
  145. if (++i == args.length) {
  146. System.err.println("-e endDate yyyy-MM-dd. Like 2013-09-01");
  147. System.exit(-1);
  148. } else {
  149. endDate = args[i];
  150. }
  151. } else if ("-host".equals(args[i])) {
  152. if (++i == args.length) {
  153. System.err.println("-host proxyHost. ");
  154. System.exit(-1);
  155. } else {
  156. proxyHost = args[i];
  157. }
  158. } else if ("-port".equals(args[i])) {
  159. if (++i == args.length) {
  160. System.err.println("-port proxyPort. Default is 8087");
  161. System.exit(-1);
  162. } else {
  163. proxyPort = Integer.valueOf(args[i]);
  164. }
  165. } else if ("-scheme".equals(args[i])) {
  166. if (++i == args.length) {
  167. System.err.println("-scheme proxyScheme. Default is http");
  168. System.exit(-1);
  169. } else {
  170. proxyScheme = args[i];
  171. }
  172. } else if ("-h".equals(args[i])) {
  173. printHelp();
  174. }
  175. }
  176. // for test begin
  177. //beginDate = "2013-01-01";
  178. //endDate = "2013-09-10";
  179. // for test end
  180. if (Utils.isEmpty(beginDate) && Utils.isEmpty(endDate)) {
  181. System.err.println("The beginDate and endDate cannot be both empty!");
  182. System.exit(-1);
  183. }
  184. AgencyFetcherApp fetcher = new AgencyFetcherApp(proxyHost, proxyPort, proxyScheme);
  185. long start = System.currentTimeMillis();
  186. if (Utils.isEmpty(beginDate)) {
  187. beginDate = endDate;
  188. }
  189. if (Utils.isEmpty(endDate)) {
  190. endDate = beginDate;
  191. }
  192. fetcher.fetchAgency(beginDate, endDate);
  193. System.out.println("Total cost: " + (System.currentTimeMillis() - start) + "ms.");
  194. }
  195. }