/WhuAtOSee/src/org/osee/crawl/service/impl/UrlServiceImpl.scala
http://oseeatwhu.googlecode.com/ · Scala · 251 lines · 142 code · 29 blank · 80 comment · 36 complexity · d7e57d3f9206852bf1b7e2b1df9c614c MD5 · raw file
- /*
- * UrlServiceImpl.scala
- *
- * Copyright @ OSee 2009
- *
- * @author zyy 2009-07-22 09:51:55
- */
-
- package org.osee.crawl.service.impl
-
- import org.osee.crawl.domain._
- import org.osee.crawl.dao._
- import org.osee.crawl.service.UrlService
-
- import java.util.regex._
- import java.util._
- import java.net._
-
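/**
 * UrlService implementation that delegates URL persistence to the urlDao
 * bean property and offers helpers for parsing and filtering URLs.
 *
 * @version 1.1
 *
 * @see UrlService
 */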
- class UrlServiceImpl extends UrlService{
-
- @scala.reflect.BeanProperty
- var urlDao : UrlDao = null
-
- private var ipClassService = new IpClassServiceImpl
- private var pageService = new PageServiceImpl
- /**
- insert new url if the url 2 insert is an finance url
- @param url2Insert : CUrl
- */
- def insertUrl(url2Insert : CUrl): Unit = {
- if(url2Insert != null){
- urlDao.insertUrl(url2Insert)
- }
- }
-
- /**
- delete url
- @param url2Delete : CUrl
- */
- def deleteUrl(url2Delete : CUrl): Unit = {
- urlDao.deleteUrl(url2Delete)
- }
-
- /**
- get the url in database
- @param strUrl : String
- @return java.util.List[CUrl]
- */
- def getUrlByUrl(strUrl : String): CUrl = {
- urlDao.getUrlByUrl(strUrl)
- }
-
- /**
- get the unvisited urls
- @return java.util.List[CUrl]
- */
- def getUnvisitedUrls(): java.util.List[CUrl] = {
- urlDao.getUnvisitedUrls
- }
-
- /**
- update the url
- */
- def updateUrl(url2Update : CUrl) : Unit = {
- urlDao.updateUrl(url2Update)
- }
-
- /**
- get the page's last update
- @param url : String
- @return String
- */
- def getLastUpdate(url : String) : Long = {
- urlDao.getLastUpdate(url)
- }
-
- /**
- break an url in to scheme, host, port and request.
- @param strUrl : String
- */
- def getParsedUrl(strUrl : String) : CUrl = {
- var url2Parse = strUrl
- println("starting parsing url:" + strUrl)
- if(url2Parse.startsWith("http://") != true) {
- url2Parse = "http://" + strUrl
- }
-
- val url = new java.net.URL(strUrl)
-
- var curl = new CUrl
- var strHost = url.getHost
- val lastUpdate = getLastUpdateFromUrl(strUrl)
- if(isFinanceUrl(strHost) && lastUpdate != -1){
- if(strUrl.contains("messages.yahoo.com") && (!strUrl.contains("Business_%26_Finance")))
- {
- curl = null
- }else{
- curl.setStrUrl(strUrl)
- curl.setLastUpdate(lastUpdate)
- // println("get header field content0 = HTTP/1.0 200 OK")
- // println("get header field content1 = Sat, 18 Jul 2009 04:50:53 GMT" ) //the result shows that this value changable
- // println("get header field content2 = Apache/2.0.63 (Unix)")
- // println("get header field content3 = Fri, 17 Jul 2009 11:50:22 GMT" ) //the last type
-
- curl.setIntPort(url.getPort)
- println("url port:" + url.getPort)
- curl.setIsVisited(false)
- curl.setStrHost(url.getHost)
- println("url host:" + url.getHost)
-
- val ipAddr = getIpByHost(url.getHost)
- var ip = new IpClass
- ip.setStrIp(ipAddr)
-
- var ipList = new java.util.HashSet[IpClass] //create the ip address
- println("ip list:" + ipList)
- ipList.add(ip)
- println("add ip")
- curl.setStrIps(ipList)
- curl.setReferencedCount(1)
- }
- }else{
- curl = null
- }
- curl
- }
-
- /**
- get the page last update time from the url header
- @param strUrl : String
- @return Long
- */
- def getLastUpdateFromUrl(strUrl : String) : Long = {
- //get the page last updated
- println("get last modfy...")
- val conn = pageService.getConnByStrUrl(strUrl)
- println("got conn..")
- var lastUpdate : Long = 0
- if(conn != null){
- lastUpdate = conn.getLastModified
- }else{
- lastUpdate = -1
- }
- println("conn = "+conn+"\nlast update" + lastUpdate)
- lastUpdate
- }
-
- /**
- get the ip address by host name
- @param strHost : String
- @return String
- */
- def getIpByHost(strHost : String) : String = {
-
- if(isValidHost(strHost)){
- val inetAddr = java.net.InetAddress.getByName(strHost).getHostAddress
- inetAddr
- }else{
- null
- }
-
- }
-
- /**
- judge is valid host
- @param strHost : String
- @return Boolean
- */
- def isValidHost(strHost : String) : Boolean = {
- if(strHost.length < 6){
- false
- }else{
- true
- }
- }
-
- /**
- judge is finance url
- @param strHost : String
- @return Boolean
- */
- def isFinanceUrl(strUrl : String) : Boolean = {
- val sinaPattern =java.util.regex.Pattern.compile("(.*?)finance.sina.com.cn")
- val sinaBlogPattern = java.util.regex.Pattern.compile("(.*?)blog.sina.com.cn")
- val p5wPattern = java.util.regex.Pattern.compile("(.*?)p5w.net")
- val hexunPattern = java.util.regex.Pattern.compile("(.*?)hexun.com")
- val souhuPattern = java.util.regex.Pattern.compile("(.*?)business.sohu.com")
- var result = false
- if(sinaPattern.matcher(strUrl).matches && (result == false)){
- true
- }else if(sinaBlogPattern.matcher(strUrl).matches && (result == false)){
- true
- }else if(p5wPattern.matcher(strUrl).matches && (result == false)){
- true
- }else if(hexunPattern.matcher(strUrl).matches && (result == false)){
- true
- }else if((souhuPattern.matcher(strUrl).matches || strUrl.contains("club.stock.sohu.com")) && (result == false)){
- true
- }else if(strUrl.contains("yahoo.com") && (result == false)){
- true
- }else{
- false
- }
-
- }
-
- /**
- judge is the text plain url, not the pdf or mp3 or otherwise
- @param strUrl : String
- @return Boolean
- */
- def isTextUrl(strUrl : String) : Boolean = {
- val url = new java.net.URL(strUrl)
- val contentType = url.openConnection().getContentType
- if(contentType == "text/html"){
- true
- }else{
- false
- }
- }
-
- /**
- @param strUrl : String
- @return Boolean
- */
- def isVisitedUrl(strUrl : String) : Boolean = {
- val visitedList = getUrlByUrl(strUrl)
- if(visitedList != null){
- true
- }else{
- false
- }
- }
-
- /**
- is valid ip
- @param strIp : String
- @return Boolean
- */
- def isValidIp(strIp : String) : Boolean = {
- val pattern = java.util.regex.Pattern.compile("((25[0-5]|2[0-4]\\d|1?\\d?\\d)\\.){3}(25[0-5]|2[0-4]\\d|1?\\d?\\d)")
- pattern.matcher(strIp).matches
- }
-
-
- }