/WhuAtOSee/src/org/osee/crawl/service/impl/UrlServiceImpl.scala

http://oseeatwhu.googlecode.com/ · Scala · 251 lines · 142 code · 29 blank · 80 comment · 36 complexity · d7e57d3f9206852bf1b7e2b1df9c614c MD5 · raw file

  1. /*
  2. * UrlServiceImpl.scala
  3. *
  4. * Copyright @ OSee 2009
  5. *
  6. * @author zyy 2009-07-22 09:51:55
  7. */
  8. package org.osee.crawl.service.impl
  9. import org.osee.crawl.domain._
  10. import org.osee.crawl.dao._
  11. import org.osee.crawl.service.UrlService
  12. import java.util.regex._
  13. import java.util._
  14. import java.net._
  15. /*
  16. * @version 1.1
  17. *
  18. * @see UrlService
  19. */
  /**
   * UrlService implementation: persistence calls are forwarded to the injected
   * UrlDao, and the remaining methods parse and classify URL / host strings.
   *
   * getParsedUrl and getIpByHost return null to signal "rejected" rather than
   * throwing.
   */
  class UrlServiceImpl extends UrlService {

    /**
     * DAO every persistence method delegates to. Starts as null and has to be
     * assigned (a bean-style setter is generated by the annotation) before any
     * of those methods is called.
     */
    @scala.reflect.BeanProperty
    var urlDao : UrlDao = null

    // Not referenced anywhere in this class; kept so construction behaves as before.
    private val ipClassService = new IpClassServiceImpl
    private val pageService = new PageServiceImpl

    // scala.List is spelled out because the file-level `import java.util._`
    // makes the bare name `List` refer to java.util.List.
    /** Host endings that mark a finance site. */
    private val financeHostSuffixes = scala.List(
      "finance.sina.com.cn",
      "blog.sina.com.cn",
      "p5w.net",
      "hexun.com",
      "business.sohu.com")

    /** Fragments that mark a finance site wherever they occur in the host. */
    private val financeHostFragments = scala.List(
      "club.stock.sohu.com",
      "yahoo.com")

    /** Dotted-quad IPv4 pattern, compiled once instead of on every isValidIp call. */
    private val ipv4Pattern = java.util.regex.Pattern.compile("((25[0-5]|2[0-4]\\d|1?\\d?\\d)\\.){3}(25[0-5]|2[0-4]\\d|1?\\d?\\d)")

    /**
    store a url; a null argument is silently ignored.
    No finance check happens here - callers that need one should build the
    CUrl through getParsedUrl.
    @param url2Insert : CUrl
    */
    def insertUrl(url2Insert : CUrl): Unit = {
      if (url2Insert != null) {
        urlDao.insertUrl(url2Insert)
      }
    }

    /**
    delete url
    @param url2Delete : CUrl
    */
    def deleteUrl(url2Delete : CUrl): Unit = {
      urlDao.deleteUrl(url2Delete)
    }

    /**
    look up the stored record for an url string
    @param strUrl : String
    @return CUrl, exactly what the dao returns (isVisitedUrl treats null as "not stored")
    */
    def getUrlByUrl(strUrl : String): CUrl = {
      urlDao.getUrlByUrl(strUrl)
    }

    /**
    get the unvisited urls
    @return java.util.List[CUrl]
    */
    def getUnvisitedUrls(): java.util.List[CUrl] = {
      urlDao.getUnvisitedUrls
    }

    /**
    update the url
    @param url2Update : CUrl
    */
    def updateUrl(url2Update : CUrl) : Unit = {
      urlDao.updateUrl(url2Update)
    }

    /**
    get the stored last-update value of a page
    @param url : String
    @return Long
    */
    def getLastUpdate(url : String) : Long = {
      urlDao.getLastUpdate(url)
    }

    /**
    give an url string a scheme when it cannot be parsed without one.
    A string java.net.URL already accepts is returned untouched, so inputs that
    used to work (http, https, mailto, ...) keep their exact spelling; only a
    string that fails to parse gets "http://" put in front of it.
    @param strUrl : String, must not be null
    @return String
    */
    private def withScheme(strUrl : String) : String = {
      try {
        new java.net.URL(strUrl)
        strUrl
      } catch {
        case e : java.net.MalformedURLException => "http://" + strUrl
      }
    }

    /**
    build a CUrl (url, host, port, ip, last update) from an url string.
    The result is null when the input is null, when the host is not a finance
    host, when no connection could be obtained (last update == -1), or when the
    url is a messages.yahoo.com page outside the Business_%26_Finance section.
    An url without a scheme is parsed as "http://" + url.
    @param strUrl : String
    @return CUrl or null
    */
    def getParsedUrl(strUrl : String) : CUrl = {
      println("starting parsing url:" + strUrl)
      if (strUrl == null) {
        null
      } else {
        val url2Parse = withScheme(strUrl)
        val url = new java.net.URL(url2Parse)
        val strHost = url.getHost
        // The header lookup still runs before the finance check, as it always did.
        val lastUpdate = getLastUpdateFromUrl(url2Parse)
        val isOtherYahooBoard =
          url2Parse.contains("messages.yahoo.com") && !url2Parse.contains("Business_%26_Finance")
        if (!isFinanceUrl(strHost) || lastUpdate == -1 || isOtherYahooBoard) {
          null
        } else {
          // Header fields seen while developing: 0 = status line, 1 = Date
          // (changes per request), 2 = Server, 3 = Last-Modified.
          val curl = new CUrl
          curl.setStrUrl(url2Parse)
          curl.setLastUpdate(lastUpdate)
          // java.net.URL.getPort is -1 when the url names no port; stored as is.
          curl.setIntPort(url.getPort)
          println("url port:" + url.getPort)
          curl.setIsVisited(false)
          curl.setStrHost(strHost)
          println("url host:" + strHost)
          val ip = new IpClass
          ip.setStrIp(getIpByHost(strHost))
          val ipList = new java.util.HashSet[IpClass] //create the ip address
          println("ip list:" + ipList)
          ipList.add(ip)
          println("add ip")
          curl.setStrIps(ipList)
          curl.setReferencedCount(1)
          curl
        }
      }
    }

    /**
    get the page last update time from the url header
    @param strUrl : String
    @return Long, -1 when pageService hands back no connection, otherwise the
            connection's getLastModified value
    */
    def getLastUpdateFromUrl(strUrl : String) : Long = {
      println("get last modfy...")
      val conn = pageService.getConnByStrUrl(strUrl)
      println("got conn..")
      val lastUpdate : Long = if (conn != null) conn.getLastModified else -1L
      println("conn = "+conn+"\nlast update" + lastUpdate)
      lastUpdate
    }

    /**
    get the ip address by host name
    @param strHost : String
    @return String, null when the host fails isValidHost
    @throws java.net.UnknownHostException when the name cannot be resolved
    */
    def getIpByHost(strHost : String) : String = {
      if (isValidHost(strHost)) {
        java.net.InetAddress.getByName(strHost).getHostAddress
      } else {
        null
      }
    }

    /**
    judge is valid host: a length check only, null or fewer than 6 characters is invalid.
    NOTE(review): 6 is presumably the shortest host name expected - confirm.
    @param strHost : String
    @return Boolean
    */
    def isValidHost(strHost : String) : Boolean = {
      strHost != null && strHost.length >= 6
    }

    /**
    judge is finance url.
    The host must end with one of the known finance domains, or contain
    "club.stock.sohu.com" or "yahoo.com". The names are compared literally;
    the earlier regexes left their dots unescaped, so any character matched.
    @param strUrl : String, the host part of an url (null gives false)
    @return Boolean
    */
    def isFinanceUrl(strUrl : String) : Boolean = {
      strUrl != null &&
        (financeHostSuffixes.exists(suffix => strUrl.endsWith(suffix)) ||
          financeHostFragments.exists(fragment => strUrl.contains(fragment)))
    }

    /**
    judge is the html text url, not the pdf or mp3 or otherwise.
    The content type is compared ignoring case and anything after the media
    type, so "text/html; charset=utf-8" counts as html. An url without a scheme
    is tried as "http://" + url. A missing content type or a null url gives false.
    @param strUrl : String
    @return Boolean
    */
    def isTextUrl(strUrl : String) : Boolean = {
      if (strUrl == null) {
        false
      } else {
        val htmlType = "text/html"
        val conn = new java.net.URL(withScheme(strUrl)).openConnection()
        try {
          val contentType = conn.getContentType
          contentType != null &&
            contentType.trim.regionMatches(true, 0, htmlType, 0, htmlType.length)
        } finally {
          // Release the HTTP connection; other connection kinds offer no disconnect.
          conn match {
            case http : java.net.HttpURLConnection => http.disconnect()
            case _ => ()
          }
        }
      }
    }

    /**
    judge whether the url is already stored, i.e. getUrlByUrl finds a record
    @param strUrl : String
    @return Boolean
    */
    def isVisitedUrl(strUrl : String) : Boolean = {
      getUrlByUrl(strUrl) != null
    }

    /**
    is valid ip: four dot-separated numbers, each 0-255 (null gives false)
    @param strIp : String
    @return Boolean
    */
    def isValidIp(strIp : String) : Boolean = {
      strIp != null && ipv4Pattern.matcher(strIp).matches
    }
  }