"""Importers for bookmarks"""
from datetime import datetime

from BeautifulSoup import BeautifulSoup

from bookie.models import BmarkMgr
class Importer(object):
    """Factory object for handling bookmark imports.

    Instantiating ``Importer(file_io)`` inspects the file content and
    yields an instance of the subclass that can parse it (DelImporter or
    GBookmarkImporter), falling back to a plain Importer when neither
    format matches.
    """

    def __init__(self, import_io):
        """Keep a handle on the file-like object we will import from."""
        self.file_handle = import_io

    def __new__(cls, *args, **kwargs):
        """Overriding new we return a subclass based on the file content."""
        # Accept the file object either positionally or as the
        # ``import_io`` keyword, matching the __init__ signature; the
        # original args[0] access raised IndexError on keyword calls.
        import_io = args[0] if args else kwargs.get('import_io')
        if DelImporter.can_handle(import_io):
            return super(Importer, cls).__new__(DelImporter)
        if GBookmarkImporter.can_handle(import_io):
            return super(Importer, cls).__new__(GBookmarkImporter)
        return super(Importer, cls).__new__(Importer)

    @staticmethod
    def can_handle(file_io):
        """This is meant to be implemented in subclasses"""
        raise NotImplementedError("Please implement this in your importer")

    def process(self, fulltext=None):
        """Meant to be implemented in subclasses"""
        raise NotImplementedError("Please implement this in your importer")

    def save_bookmark(self, url, desc, ext, tags, dt=None, fulltext=None):
        """Save the bookmark to the db

        :param url: bookmark url
        :param desc: one line description
        :param ext: extended description/notes
        :param tags: the string of tags to store with this bmark
        :param dt: optional datetime the bookmark was originally added
        :param fulltext: fulltext handler instance used to store that info

        """
        BmarkMgr.store(url, desc, ext, tags, dt=dt, fulltext=fulltext)
class DelImporter(Importer):
    """Process a delicious html export file."""

    @staticmethod
    def _is_delicious_format(soup, delicious_doctype):
        """Return True when this parsed document is a Delicious export.

        Very fragile currently: it requires the doctype to be the very
        first node in the document, so any blank lines before it cause
        detection to fail.
        """
        # Delicious and Google share the Netscape doctype, but only
        # Google exports contain <h3> label headings.
        return bool(soup.contents
                    and soup.contents[0] == delicious_doctype
                    and not soup.find('h3'))

    @staticmethod
    def can_handle(file_io):
        """Check if this file is a delicious bookmarks format file.

        In order to check the file we have to read it and check its
        content type.

        Google Bookmarks and Delicious both have the same content type,
        but they use different formats. We use the fact that Google
        Bookmarks uses <h3> tags and Delicious does not in order to
        differentiate these two formats.

        """
        delicious_doctype = "DOCTYPE NETSCAPE-Bookmark-file-1"
        soup = BeautifulSoup(file_io)
        can_handle = DelImporter._is_delicious_format(soup,
                                                      delicious_doctype)
        # make sure we reset the file_io object so that we can use it again
        file_io.seek(0)
        return can_handle

    def process(self, fulltext=None):
        """Parse the file handle and save each bookmark found.

        :param fulltext: fulltext handler instance passed through to
            :meth:`save_bookmark`

        """
        soup = BeautifulSoup(self.file_handle)
        for tag in soup.findAll('dt'):
            # if we have a dd as next sibling, get its content as the
            # extended description
            if tag.nextSibling and tag.nextSibling.name == 'dd':
                extended = tag.nextSibling.text
            else:
                extended = ""
            link = tag.a
            # Delicious stores the added time as a unix timestamp string
            add_date = datetime.fromtimestamp(float(link['add_date']))
            self.save_bookmark(link['href'],
                               link.text,
                               extended,
                               " ".join(link['tags'].split(',')),
                               dt=add_date,
                               fulltext=fulltext)
class GBookmarkImporter(Importer):
    """Process a Google Bookmark export html file."""

    @staticmethod
    def _is_google_format(soup, gbookmark_doctype):
        """Return True when this parsed document is a Google export.

        Google only puts one tag at a time and needs to be looped through
        to get them all. See the sample files in the test_importer
        directory.
        """
        # Google exports share the Netscape doctype with Delicious but,
        # unlike Delicious, contain <h3> label headings.
        return bool(soup.contents
                    and soup.contents[0] == gbookmark_doctype
                    and soup.find('h3'))

    @staticmethod
    def can_handle(file_io):
        """Check if this file is a google bookmarks format file.

        In order to check the file we have to read it and check its
        content type.

        Google Bookmarks and Delicious both have the same content type,
        but they use different formats. We use the fact that Google
        Bookmarks uses <h3> tags and Delicious does not in order to
        differentiate these two formats.

        """
        gbookmark_doctype = "DOCTYPE NETSCAPE-Bookmark-file-1"
        soup = BeautifulSoup(file_io)
        can_handle = GBookmarkImporter._is_google_format(soup,
                                                        gbookmark_doctype)
        # make sure we reset the file_io object so that we can use it again
        file_io.seek(0)
        return can_handle

    def process(self, fulltext=None):
        """Process an html google bookmarks export and import them into bookie.

        The export format is a tag as a heading, with urls that have that
        tag under that heading. If a url has N tags, it will appear N
        times, once under each heading.

        :param fulltext: fulltext handler instance passed through to
            :meth:`save_bookmark`

        """
        soup = BeautifulSoup(self.file_handle)
        if not soup.contents[0] == "DOCTYPE NETSCAPE-Bookmark-file-1":
            raise Exception("File is not a google bookmarks file")

        urls = dict()  # url: url_metadata

        # we don't want to just import all the available urls, since each
        # url occurs once per tag. loop through and aggregate the tags for
        # each url
        for tag in soup.findAll('h3'):
            # tag text is loop-invariant for all links under this heading
            tag_text = tag.text.replace(" ", "-")
            links = tag.findNextSibling('dl').findAll("a")
            for link in links:
                url = link["href"]
                if url in urls:
                    # 'Unlabeled' is Google's pseudo-label for untagged
                    # bookmarks; never record it as a real tag (the
                    # first-seen branch below already excludes it)
                    if tag_text != 'Unlabeled':
                        urls[url]['tags'].append(tag_text)
                else:
                    tags = [tag_text] if tag_text != 'Unlabeled' else []

                    # get extended description from the trailing <dd>
                    has_extended = (link.parent.nextSibling and
                                    link.parent.nextSibling.name == 'dd')
                    if has_extended:
                        extended = link.parent.nextSibling.text
                    else:
                        extended = ""

                    # date the site was bookmarked; Google stores it in
                    # microseconds since the epoch
                    timestamp_added = float(link['add_date']) / 1e6

                    urls[url] = {
                        'description': link.text,
                        'tags': tags,
                        'extended': extended,
                        'date_added': datetime.fromtimestamp(timestamp_added),
                    }

        # save the bookmarks
        for url, metadata in urls.items():
            self.save_bookmark(url,
                               metadata['description'],
                               metadata['extended'],
                               " ".join(metadata['tags']),
                               dt=metadata['date_added'],
                               fulltext=fulltext)