PageRenderTime 113ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/bookie/lib/importer.py

https://github.com/gregmalcolm/Bookie
Python | 201 lines | 174 code | 10 blank | 17 comment | 2 complexity | 62da8a571de2909ffd68008da6c114ad MD5 | raw file
  1. """Importers for bookmarks"""
  2. from datetime import datetime
  3. from BeautifulSoup import BeautifulSoup
  4. from bookie.models import BmarkMgr
  5. class Importer(object):
  6. """The actual factory object we use for handling imports"""
  7. def __init__(self, import_io):
  8. """work on getting an importer instance"""
  9. self.file_handle = import_io
  10. def __new__(cls, *args, **kwargs):
  11. """Overriding new we return a subclass based on the file content"""
  12. if DelImporter.can_handle(args[0]):
  13. return super(Importer, cls).__new__(DelImporter)
  14. if GBookmarkImporter.can_handle(args[0]):
  15. return super(Importer, cls).__new__(GBookmarkImporter)
  16. return super(Importer, cls).__new__(Importer)
  17. @staticmethod
  18. def can_handle(file_io):
  19. """This is meant to be implemented in subclasses"""
  20. raise NotImplementedError("Please implement this in your importer")
  21. def process(self, fulltext=None):
  22. """Meant to be implemented in subclasses"""
  23. raise NotImplementedError("Please implement this in your importer")
  24. def save_bookmark(self, url, desc, ext, tags, dt=None, fulltext=None):
  25. """Save the bookmark to the db
  26. :param url: bookmark url
  27. :param desc: one line description
  28. :param ext: extended description/notes
  29. :param tags: The string of tags to store with this bmark
  30. :param mark: Instance of Bmark that we're storing to db
  31. :param fulltext: Fulltext handler instance used to store that info
  32. """
  33. BmarkMgr.store(url, desc, ext, tags, dt=dt, fulltext=fulltext)
  34. class DelImporter(Importer):
  35. """Process a delicious html file"""
  36. @staticmethod
  37. def _is_delicious_format(soup, can_handle, delicious_doctype):
  38. """A check for if this import files is a delicious format compat file
  39. Very fragile currently, it makes sure the first line is the doctype.
  40. Any blank lines before it will cause it to fail
  41. """
  42. if soup.contents \
  43. and soup.contents[0] == delicious_doctype \
  44. and not soup.find('h3'):
  45. can_handle = True
  46. return can_handle
  47. @staticmethod
  48. def can_handle(file_io):
  49. """Check if this file is a google bookmarks format file
  50. In order to check the file we have to read it and check it's content
  51. type.
  52. Google Bookmarks and Delicious both have the same content type, but
  53. they use different formats. We use the fact that Google Bookmarks
  54. uses <h3> tags and Delicious does not in order to differentiate these
  55. two formats.
  56. """
  57. delicious_doctype = "DOCTYPE NETSCAPE-Bookmark-file-1"
  58. soup = BeautifulSoup(file_io)
  59. can_handle = False
  60. can_handle = DelImporter._is_delicious_format(soup,
  61. can_handle,
  62. delicious_doctype)
  63. # make sure we reset the file_io object so that we can use it again
  64. file_io.seek(0)
  65. return can_handle
  66. def process(self, fulltext=None):
  67. """Given a file, process it"""
  68. soup = BeautifulSoup(self.file_handle)
  69. for tag in soup.findAll('dt'):
  70. # if we have a dd as next sibling, get it's content
  71. if tag.nextSibling and tag.nextSibling.name == 'dd':
  72. extended = tag.nextSibling.text
  73. else:
  74. extended = ""
  75. link = tag.a
  76. add_date = datetime.fromtimestamp(float(link['add_date']))
  77. self.save_bookmark(link['href'],
  78. link.text,
  79. extended,
  80. " ".join(link['tags'].split(',')),
  81. dt=add_date,
  82. fulltext=fulltext)
  83. class GBookmarkImporter(Importer):
  84. """Process a Google Bookmark export html file"""
  85. @staticmethod
  86. def _is_google_format(soup, gbookmark_doctype, can_handle):
  87. """Verify that this import file is in the google export format
  88. Google only puts one tag at a time and needs to be looped through to
  89. get them all. See the sample files in the test_importer directory
  90. """
  91. if soup.contents \
  92. and soup.contents[0] == gbookmark_doctype \
  93. and soup.find('h3'):
  94. can_handle = True
  95. return can_handle
  96. @staticmethod
  97. def can_handle(file_io):
  98. """Check if this file is a google bookmarks format file
  99. In order to check the file we have to read it and check it's content
  100. type
  101. Google Bookmarks and Delicious both have the same content type, but
  102. they use different formats. We use the fact that Google Bookmarks
  103. uses <h3> tags and Delicious does not in order to differentiate these
  104. two formats.
  105. """
  106. soup = BeautifulSoup(file_io)
  107. can_handle = False
  108. gbookmark_doctype = "DOCTYPE NETSCAPE-Bookmark-file-1"
  109. can_handle = GBookmarkImporter._is_google_format(soup,
  110. gbookmark_doctype,
  111. can_handle)
  112. # make sure we reset the file_io object so that we can use it again
  113. file_io.seek(0)
  114. return can_handle
  115. def process(self, fulltext=None):
  116. """Process an html google bookmarks export and import them into bookie
  117. The export format is a tag as a heading, with urls that have that tag
  118. under that heading. If a url has N tags, it will appear N times, once
  119. under each heading.
  120. """
  121. soup = BeautifulSoup(self.file_handle)
  122. if not soup.contents[0] == "DOCTYPE NETSCAPE-Bookmark-file-1":
  123. raise Exception("File is not a google bookmarks file")
  124. urls = dict() # url:url_metadata
  125. # we don't want to just import all the available urls, since each url
  126. # occurs once per tag. loop through and aggregate the tags for each url
  127. for tag in soup.findAll('h3'):
  128. links = tag.findNextSibling('dl').findAll("a")
  129. for link in links:
  130. url = link["href"]
  131. tag_text = tag.text.replace(" ", "-")
  132. if url in urls:
  133. urls[url]['tags'].append(tag_text)
  134. else:
  135. tags = [tag_text] if tag_text != 'Unlabeled' else []
  136. # get extended description
  137. has_extended = (link.parent.nextSibling and
  138. link.parent.nextSibling.name == 'dd')
  139. if has_extended:
  140. extended = link.parent.nextSibling.text
  141. else:
  142. extended = ""
  143. # date the site was bookmarked
  144. timestamp_added = float(link['add_date']) / 1e6
  145. urls[url] = {
  146. 'description': link.text,
  147. 'tags': tags,
  148. 'extended': extended,
  149. 'date_added': datetime.fromtimestamp(timestamp_added),
  150. }
  151. # save the bookmark
  152. for url, metadata in urls.items():
  153. self.save_bookmark(url,
  154. metadata['description'],
  155. metadata['extended'],
  156. " ".join(metadata['tags']),
  157. dt=metadata['date_added'],
  158. fulltext=fulltext)