PageRenderTime 109ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/scala/com/cloudwick/sync/jobs/fetcher/ProcessJobPosting.scala

https://gitlab.com/thugside/sync
Scala | 176 lines | 144 code | 19 blank | 13 comment | 3 complexity | 8e236c0f41fe372d0f364a923118cf0a MD5 | raw file
  1. package com.cloudwick.sync.jobs.fetcher
  2. import java.io.{InputStreamReader, BufferedReader}
  3. import java.net.{HttpURLConnection, URL}
  4. import akka.actor.Actor
  5. import akka.event.Logging
  6. import com.mongodb.casbah.MongoCollection
  7. import com.mongodb.casbah.commons.MongoDBObject
  8. import com.mongodb.casbah.Imports._
  9. import com.mongodb.casbah.commons.conversions.scala._
  10. import org.joda.time.{DateTimeZone, DateTime}
  11. import org.joda.time.format.DateTimeFormat
  12. import scalaj.http.{Http, HttpOptions, HttpResponse}
  /**
   * Actor that downloads a single job posting, extracts skills, e-mail addresses and phone
   * numbers from its HTML, and stores the enriched posting in a mongo collection
   * (alias: pr[0-n] -> processrequest).
   *
   * For every `Messages.Job` it receives, the actor replies to the sender with exactly one of
   * `Messages.JobUrlInserted`, `Messages.JobUrlRepeated`, `Messages.JobUrlDuplicate`,
   * `Messages.JobUrlSkipped` or `Messages.JobUrlFailed`.
   *
   * @param url job url to process (used for diagnostics when the actor is restarted)
   * @param uid unique id for the actor processing this job url
   * @param collection mongo collection object to insert the processed job's to
   * @author ashrith
   */
  class ProcessJobPosting(url: String,
                          uid: Int,
                          collection: MongoCollection) extends Actor {

    val log = Logging(context.system, this)

    /** Captures the comma separated skills list rendered inside the "Skills:" dt/dd pair. */
    val skillsPattern = """\s+<dt.*>Skills:<\/dt>\s+<dd.*>(.*)<\/dd>""".r

    /** Matches e-mail addresses embedded anywhere in the posting. */
    val emailPattern = """\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b""".r

    /** Matches North-American style phone numbers with optional country code and extension. */
    val phonePattern = """(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?""".r

    // Compiled once instead of once per line inside toWords.
    private val wordPattern = "[a-zA-Z]+".r

    // Lets casbah (de)serialize joda DateTime values such as "date_posted".
    RegisterJodaTimeConversionHelpers()

    val dateFormat = DateTimeFormat.forPattern("yyyy-MM-dd").withZone(DateTimeZone.forID("UTC"))

    override def preStart(): Unit =
      log.debug("Starting ProcessRequest")

    override def preRestart(reason: Throwable, message: Option[Any]): Unit = {
      log.error(reason, "Restarting due to [{}] when processing [{}] (url: {})",
        reason.getMessage, message.getOrElse(""), url)
      // Keep the default restart behaviour (stop children, run postStop).
      super.preRestart(reason, message)
    }

    /**
     * Splits every line into lower-cased alphabetic words.
     *
     * @param lines text fragments to tokenize
     * @return all words (runs of ASCII letters) of all lines, in order, lower-cased
     */
    def toWords(lines: List[String]): List[String] =
      lines.flatMap(line => wordPattern.findAllIn(line).map(_.toLowerCase))

    /**
     * Opens an HTTP connection to `uRL`, following 301/302/303/307/308 redirects manually.
     *
     * Redirects are followed here because `HttpURLConnection` does not follow redirects that
     * switch protocol (for example http -> https) on its own.
     *
     * @param uRL          url to connect to
     * @param maxRedirects maximum number of redirects to follow before giving up
     * @return a connection whose response code is 200
     * @throws java.io.IOException if the response status is neither 200 nor a redirect, a
     *                             redirect carries no `Location` header, the redirect limit is
     *                             exceeded, or the target is not an HTTP(S) url
     */
    def handleRequest(uRL: String, maxRedirects: Int = 5): HttpURLConnection = {
      @scala.annotation.tailrec
      def open(target: String, redirectsLeft: Int): HttpURLConnection =
        new URL(target).openConnection() match {
          case conn: HttpURLConnection =>
            conn.setReadTimeout(50000)
            conn.setConnectTimeout(10000)
            conn.getResponseCode match {
              case HttpURLConnection.HTTP_OK =>
                conn
              case HttpURLConnection.HTTP_MOVED_TEMP | HttpURLConnection.HTTP_MOVED_PERM |
                   HttpURLConnection.HTTP_SEE_OTHER | 307 | 308 =>
                // getHeaderField returns null when the header is absent
                val location = Option(conn.getHeaderField("Location"))
                // this hop is finished; release it before opening the next one
                conn.disconnect()
                location match {
                  case Some(next) if redirectsLeft > 0 =>
                    // resolve against the current url so relative Location values work
                    open(new URL(new URL(target), next).toString, redirectsLeft - 1)
                  case Some(_) =>
                    throw new java.io.IOException(s"Too many redirects while fetching [$uRL]")
                  case None =>
                    throw new java.io.IOException(
                      s"Redirect without Location header while fetching [$target]")
                }
              case status =>
                conn.disconnect()
                throw new java.io.IOException(s"Unexpected HTTP status [$status] for [$target]")
            }
          case _ =>
            throw new java.io.IOException(s"Not an HTTP(S) url: [$target]")
        }

      open(uRL, maxRedirects)
    }

    /**
     * Downloads the body of `uRL` as text.
     *
     * NOTE(review): the body is decoded with the platform default charset, as before.
     *
     * @param uRL url to download
     * @return the response body with lines joined by "\n"
     */
    def processRequest(uRL: String): String = {
      val conn = handleRequest(uRL)
      try {
        val in = new BufferedReader(new InputStreamReader(conn.getInputStream))
        // readLine signals end of stream with null (Java API)
        try Iterator.continually(in.readLine()).takeWhile(_ != null).mkString("\n")
        finally in.close()
      } finally {
        conn.disconnect()
      }
    }

    /**
     * Decides whether a posting is of interest.
     *
     * @param content    downloaded posting
     * @param searchTerm term that has to appear in the posting
     * @return true when `content` contains `searchTerm`
     */
    def keepPosting(content: String, searchTerm: String): Boolean =
      content.contains(searchTerm)

    /**
     * Extracts the skills listed in the posting.
     *
     * @param content downloaded posting
     * @return the trimmed, non-empty skills of the first "Skills:" section, or an empty list
     */
    def getSkills(content: String): List[String] =
      skillsPattern
        .findFirstMatchIn(content)
        .map(_.group(1))
        .fold(List.empty[String]) { skills =>
          skills.replaceAll("&nbsp;", "").split(",").map(_.trim).filter(_.nonEmpty).toList
        }

    /**
     * Extracts the distinct e-mail addresses of the posting.
     *
     * @param content downloaded posting
     * @return distinct addresses in order of appearance, without the "email@domain.com" placeholder
     */
    def getEmails(content: String): List[String] =
      emailPattern
        .findAllIn(content)
        .toList
        .filter(_ != "email@domain.com")
        .distinct

    /**
     * Extracts the distinct phone numbers of the posting.
     *
     * @param content downloaded posting
     * @return distinct matches longer than 10 characters, in order of appearance
     */
    def getPhones(content: String): List[String] =
      phonePattern
        .findAllIn(content)
        .toList
        .filter(_.length > 10)
        .distinct

    def receive: Receive = {
      case Messages.Job(iUrl, title, company, location, date, searchTerm, grepWord) =>
        log.debug("Processing [{}]", iUrl)
        val replyTo = sender()
        try {
          // parsed inside the try so that a malformed date is reported as JobUrlFailed
          // instead of crashing the actor and leaving the sender without a reply
          val qDate = DateTime.parse(date, dateFormat)
          val urlContent = processRequest(iUrl)
          if (keepPosting(urlContent, grepWord)) {
            val skills = getSkills(urlContent)
            val emails = getEmails(urlContent)
            val phoneNums = getPhones(urlContent)
            // splat everything for _keywords
            val keywords = toWords(skills) :::
              toWords(emails) :::
              toWords(List(title)) :::
              toWords(List(company)) :::
              toWords(List(location))
            // look the posting up by the same url it is inserted under
            val sObj = MongoDBObject("url" -> iUrl)
            collection.findOne(sObj) match {
              case Some(obj) =>
                val dObj = sObj ++ ("date_posted" -> qDate)
                collection.findOne(dObj) match {
                  // check if the url exists with same date posted then ignore it
                  case Some(_) =>
                    log.info("Document with url: [{}] and date: [{}] already exists", iUrl, date)
                    replyTo ! Messages.JobUrlDuplicate
                  case None =>
                    log.info("Repeated job posting found: [{}]", iUrl)
                    // remember the previous posting date before overwriting it
                    val existingDate = obj("date_posted")
                    val update = MongoDBObject(
                      "$set" -> MongoDBObject("date_posted" -> qDate, "repeated" -> true),
                      "$addToSet" -> MongoDBObject("pdates" -> existingDate)
                    )
                    collection.update(sObj, update)
                    replyTo ! Messages.JobUrlRepeated
                }
              case None =>
                // construct a mongo object to insert
                val iObj = MongoDBObject(
                  "url" -> iUrl,
                  "link_active" -> true,
                  "date_posted" -> qDate,
                  "search_term" -> searchTerm,
                  "source" -> "DICE", // TODO replace this with the parameter
                  "title" -> title,
                  "company" -> company,
                  "location" -> location,
                  "skills" -> skills,
                  "emails" -> emails,
                  "phone_nums" -> phoneNums,
                  "read" -> false,
                  "hide" -> false,
                  "_keywords" -> keywords,
                  "version" -> 2
                )
                log.info("Inserting: [{}]", iObj.toString)
                collection.insert(iObj)
                // Update the sender and notify that job url processing completed
                replyTo ! Messages.JobUrlInserted
            }
          } else {
            log.debug("Skipping [{}] as grep_term:'{}' not found", iUrl, grepWord)
            replyTo ! Messages.JobUrlSkipped
          }
        } catch {
          case scala.util.control.NonFatal(ex) =>
            // exception as the first arg will print the stack trace
            log.error(ex, "Failed parsing url: [{}] because of [{}]", iUrl, ex.getMessage)
            replyTo ! Messages.JobUrlFailed
        }
      case x =>
        log.warning("Message not recognized: [{}]. Path: [{}]", x, sender().path)
    }
  }