fix_bare_links.py | searchcode

/scripts/fix_bare_links.py

https://github.com/wangmxf/lesswrong
Python | 81 lines | 57 code | 18 blank | 6 comment | 14 complexity | 3209d7d19b08bc9f8c7e4c69f93ac8e9 MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-2.1

import re
import sys
import codecs

# NOTE: the URL character classes in the two /lw/ patterns list both A-Z and
# a-z.  With only A-Z (and no re.IGNORECASE) a lowercase letter counted as a
# "non-URL" character, so the path inside "http://lesswrong.com/lw/x/y/ " was
# treated as a bare link and the absolute URL was split in two.

# A site-relative "/lw/<segment>/<segment>/" path that stands on its own:
#   group 1 - the preceding character (must not be a URL character, '"' or
#             '>'), or the start of the string
#   group 2 - the path itself
#   group 3 - the following character (must not be a URL character, '"' or
#             '<'; ',' and '.' are allowed so trailing punctuation is fine),
#             or the end of the string
bare_link_re = re.compile(r'([^-A-Za-z0-9+&@#/%?=~_|!:,.;">]|^)(/lw/[^/]+/[^/]+/)([^-A-Za-z0-9+&@#/%?=~_|!:;"<]|$)')

# An anchor whose *text* is a site-relative /lw/ path:
#   group 1 - the opening <a href="..."> tag
#   group 2 - the link text through the closing </a>
linked_bare_link_re = re.compile(r'(<a href="[^"]+">)(/lw/[^/]+/[^/]+[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]</a>)')

# Opening anchor tag with whitespace after the '=' (optionally before it too),
# optional double quotes and optional whitespace before '>'.  Group 1: href.
spaces_around_anchor_re = re.compile(r'<a href\s*=\s+"?([^">]+)"?\s*>', re.IGNORECASE)

# Complete anchor whose href value is not quoted.  Group 1: href, group 2: text.
no_quotes_on_anchor_re = re.compile(r'<a href=([^>\'"]+)>([^<]+)</a>', re.IGNORECASE)

# Opening anchor tag whose href is single-quoted (lowercase tag only).
# Group 1: href.
single_quotes_on_anchor_re = re.compile(r'<a href=\'([^>]+)\'>')

# Otherwise well-formed anchor written with an uppercase <A ...>...</A> tag and
# any casing of "href".  Group 1: href, group 2: text.
well_formed_uppercase_re = re.compile(r'<A [Hh][Rr][Ee][Ff]="([^"]+)">([^<]+)</A>')

def sub_group_1(match):
    """Return a normalised opening anchor tag whose href is group 1 of *match*."""
    href = match.group(1)
    template = '<a href="%s">'
    return template % (href,)

def sub_with_end_tag(match):
    """Return a complete normalised anchor built from *match*.

    Group 1 supplies the href and group 2 the link text.
    """
    href, text = match.group(1, 2)
    template = '<a href="%s">%s</a>'
    return template % (href, text)

def wrap_bare_link(match):
    """Wrap the bare path in group 2 of *match* in an anchor tag.

    The href keeps the site-relative path, the link text is the path prefixed
    with http://lesswrong.com, and the surrounding characters captured in
    groups 1 and 3 are put back on either side.
    """
    before, path, after = match.group(1, 2, 3)
    template = '%s<a href="%s">http://lesswrong.com%s</a>%s'
    return template % (before, path, path, after)

def add_host_to_linked_bare_link(match):
    """Insert http://lesswrong.com between groups 1 and 2 of *match*.

    Group 1 is expected to be the opening anchor tag and group 2 the
    site-relative link text plus closing tag.
    """
    opening_tag = match.group(1)
    text_and_close = match.group(2)
    return ''.join((opening_tag, 'http://lesswrong.com', text_and_close))

def rewrite_bare_links(content):
    """Return *content* with its anchors normalised and bare /lw/ links fixed.

    The substitutions run strictly in the order listed: the anchor clean-ups
    come first so that the bare-link patterns see consistently formatted
    ``<a href="...">`` tags.
    """
    passes = (
        # Tidy up strange HTML first
        (spaces_around_anchor_re, sub_group_1),
        (no_quotes_on_anchor_re, sub_with_end_tag),
        (single_quotes_on_anchor_re, sub_group_1),
        (well_formed_uppercase_re, sub_with_end_tag),
        # Fix bare links
        (bare_link_re, wrap_bare_link),
        (linked_bare_link_re, add_host_to_linked_bare_link),
    )
    for pattern, replacer in passes:
        content = pattern.sub(replacer, content)
    return content

def fix_bare_links(apply=False):
    """Scan comments by ascending id and rewrite bare links in their bodies.

    Only comments whose ``ob_imported`` and ``is_html`` attributes are both
    present and truthy are examined.  For every comment whose body is changed
    by ``rewrite_bare_links`` the old and new bodies are appended to
    ``fix_bare_links_before.txt`` and ``fix_bare_links_after.txt`` (UTF-8, in
    the current directory) and a progress line is written to stderr.

    apply -- when true, the rewritten body is stored on the comment and
             committed; when false (the default) nothing is modified and the
             two log files act as a dry-run report.

    The scan stops at the first id for which ``Comment._byID`` raises
    ``NotFound``.
    """
    from r2.models import Comment
    from r2.lib.db.thing import NotFound

    fbefore = fafter = None
    comment_id = 0
    try:
        fbefore = codecs.open('fix_bare_links_before.txt', 'w', 'utf-8')
        fafter = codecs.open('fix_bare_links_after.txt', 'w', 'utf-8')

        # The comments are retrieved one id at a time to prevent the API from
        # attempting to load all comments at once and then iterating over them
        while True:
            comment_id += 1
            comment = Comment._byID(comment_id, data=True)

            imported = getattr(comment, 'ob_imported', False)
            is_html = getattr(comment, 'is_html', False)
            if not (imported and is_html):
                continue

            body = comment.body
            if isinstance(body, str):
                try:
                    body = body.decode('utf-8')
                except UnicodeDecodeError:
                    print >>sys.stderr, "UnicodeDecodeError, using 'ignore' error mode, comment: %d" % comment._id
                    # Positional error mode: str.decode() only accepts
                    # keyword arguments from Python 2.7 onwards.
                    body = body.decode('utf-8', 'ignore')

            new_content = rewrite_bare_links(body)
            if new_content == body:
                continue

            print >>fbefore, body
            print >>fafter, new_content

            if apply:
                comment.body = new_content
                comment._commit()

            try:
                print >>sys.stderr, "Rewrote comment %s" % comment.make_permalink_slow().encode('utf-8')
            except UnicodeError:
                print >>sys.stderr, "Rewrote comment with id: %d" % comment._id
    except NotFound:
        # Assumes that comment ids are sequential and never deleted
        # (which I believe to be true) -- wjm
        print >>sys.stderr, "Comment %d not found, exiting" % comment_id
    finally:
        # Always flush and release the report files, including when the scan
        # aborts with an unexpected error.
        for report in (fbefore, fafter):
            if report is not None:
                report.close()