PageRenderTime 51ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/scala/code/redpanda/Fetcher.scala

https://github.com/mikedorseyjr/redpanda
Scala | 122 lines | 79 code | 21 blank | 22 comment | 3 complexity | f0b08a0d3af402c4e6998f4db9e706dc MD5 | raw file
  1. // The purpose of this package is to contain a number of classes for fetcher (job source) types.
  2. // There will probably be a factory method that iterates through all of the various
  3. // fetcher types and spawns out a fetcher that returns all jobs to a massager.
  4. // To be refactored and made more generic for readability.
  5. import com.mongodb._
  6. import com.mongodb.casbah.Imports._
  7. import com.mongodb.casbah.commons.conversions.scala._
  8. import com.mongodb.casbah.query
  9. import java.net.{URLConnection, URL}
  10. import scala.xml._
  11. import scala.io._
  12. package code.redpanda.data_fetching {
  13. import code.redpanda.data_fetching.Fetcher.RssFetcher
  /**
   * Base type for every fetcher (job source).
   *
   * For now it declares both a string-based `fetch` and an XML-based `xmlFetch`. A common base
   * type that owns the XML parsing while still allowing lists of some other type to be returned
   * would be preferable; that redesign is still open.
   *
   * @param url        address of the job source
   * @param fetch_type label naming the kind of source
   */
  abstract class Fetcher(val url: String, val fetch_type: String) {

    /** Returns the fetched jobs as a list of strings to be massaged. */
    def fetch(): List[String]

    /** Returns the fetched content as a parsed XML element. */
    def xmlFetch(): Elem
  }
  /**
   * Factory object used for creating the various types of fetchers.
   */
  object Fetcher {

    /**
     * Fetcher for RSS feeds. `xmlFetch` returns the feed as XML so that the Scala XML classes can
     * be used for parsing; `fetch` has no real implementation.
     *
     * @param r_url address of the RSS feed; also passed to the base class as its `url`
     */
    private class RssFetcher(val r_url: String) extends Fetcher(r_url, "RSS") {

      /**
       * Opens a connection to `r_url` and parses the response body as XML.
       *
       * @return the root element of the parsed document
       */
      override def xmlFetch(): Elem = {
        val feedStream = new URL(r_url).openConnection.getInputStream
        // Always release the stream, including when parsing throws, so the
        // underlying connection is not leaked.
        try XML.load(feedStream)
        finally feedStream.close()
      }

      /** No implementation here: always returns a list holding a single empty string. */
      override def fetch(): List[String] = List("")
    }

    /**
     * Returns the fetcher appropriate for the given fetch type.
     *
     * Only `"rss"` is recognised; any other value raises a `RuntimeException` via `sys.error`.
     *
     * @param fetch_type kind of fetcher to build
     * @param url        address the fetcher will read from
     */
    def createFetcher(fetch_type: String, url: String): Fetcher =
      fetch_type match {
        case "rss" => new RssFetcher(url)
        case _     => sys.error("Unknown option.")
      }
  }
  /**
   * Base class for the site fetchers that acquire job data. It holds the iteration logic all
   * site fetchers share; subclasses only override `job_process`. It may later become abstract
   * and move behind a factory method.
   *
   * @param j_sites collection queried for the sites belonging to the fetch group
   * @param j_docs  collection handed in for job documents (not used by this base class itself)
   * @param f_group value matched against each site's `fetch_group` field
   * @param f_types fetch types this fetcher accepts
   */
  class SiteFetcher(
      val j_sites: MongoCollection,
      val j_docs: MongoCollection,
      val f_group: String,
      val f_types: List[String]) {

    val fetch_group: String = f_group
    val fetch_types: List[String] = f_types

    /**
     * Finds every site whose `fetch_group` equals `f_group` and passes each one whose
     * `fetch_type` is listed in `fetch_types` to `job_process`.
     */
    def fetchJobs(): Unit = {
      // Fetch all of the collection sites for our fetch group.
      val groupQuery = MongoDBObject("fetch_group" -> f_group)
      for (site <- j_sites.find(groupQuery)) {
        // Only process the fetch types this class accepts.
        if (fetch_types.contains(site("fetch_type").toString)) {
          job_process(site)
        }
      }
    }

    /**
     * Processes one site entry. Empty here; every subclass implements its own.
     *
     * @param entry site document selected by `fetchJobs`
     */
    def job_process(entry: MongoDBObject): Unit = {}
  }
  /**
   * Fetcher used for processing Dice data. Only `job_process` is defined here: it describes how
   * one site entry is turned into documents in the `job_docs` collection.
   *
   * @param sites    collection of site entries, passed to the base class
   * @param job_docs collection job documents are looked up in and saved to
   */
  class DiceFetcher(val sites: MongoCollection, val job_docs: MongoCollection)
      extends SiteFetcher(sites, job_docs, "dice", List("rss")) {

    /**
     * Fetches the feed named by the entry's `url` field and, for every `item` element in it,
     * saves a job document unless one with the same title, link and region already exists.
     *
     * @param entry site document; its `url`, `fetch_type` and `location` fields are read
     */
    override def job_process(entry: MongoDBObject): Unit = {
      val feedUrl = entry("url").toString
      val fetch_type = entry("fetch_type").toString
      val location = entry("location").toString
      // The factory picks the fetcher implementation for this entry's fetch type.
      val listing = Fetcher.createFetcher(fetch_type, feedUrl).xmlFetch()

      for (item <- listing \\ "item") {
        val title = (item \ "title").text
        val link = (item \ "link").text
        // Skip items that already have a stored document with this title, link and region.
        val existingQuery = MongoDBObject("title" -> title,
                                          "link" -> link,
                                          "region" -> location)
        if (job_docs.count(existingQuery) <= 0) {
          // The page behind the item's link becomes the body of the job entry.
          val body = downloadBody(link)
          val posting = MongoDBObject("title" -> title,
                                      "link" -> link,
                                      "body" -> body,
                                      "region" -> location)
          job_docs.save(posting)
        }
      }
    }

    /**
     * Downloads the content at `link` as a single string and closes the source afterwards.
     *
     * NOTE(review): the original author reported that this download does not work for some
     * real URLs (possibly those carrying form variables); that still needs investigating.
     */
    private def downloadBody(link: String): String = {
      val page = scala.io.Source.fromURL(new URL(link))
      // Close the source even when reading fails so the connection is not leaked.
      try page.mkString
      finally page.close()
    }
  }
  100. }