100+ results for 'beautifulsoup'

examples.py (https://github.com/towerjoo/django-test-extensions.git) Python · 112 lines

102

103 def test_using_beautiful_soup(self):

104 "Example test for content on a given view, this time using the BeautifulSoup parser"

105 response = self.client.get('/example/')

106 soup = BeautifulSoup(response.content)

feed.py (https://bitbucket.org/milos07p/pypsd-nao-on-git.git) Python · 194 lines

5 import xpath

6 from BaseHTTPServer import BaseHTTPRequestHandler

7 from BeautifulSoup import BeautifulSoup

8 from decimal import Decimal

9 from StringIO import StringIO

91

92 def get_soup(self):

93 return BeautifulSoup(self._html, convertEntities=BeautifulSoup.HTML_ENTITIES)

94

95 def get_json(value):

forms.py (https://github.com/stamen/fieldpapers.git) Python · 142 lines

6 import json

7

8 from BeautifulSoup import BeautifulSoup

9 from apiutils import finish_form, fail_form

10

21 """

22 page = urlopen(url)

23 soup = BeautifulSoup(page)

24 form = soup.form

25

thingSpider.py (https://github.com/enjrolas/Makerbot-Vending-Machine.git) Python · 138 lines

2 import urllib2

3 import re

4 from BeautifulSoup import BeautifulSoup, SoupStrainer, NavigableString

5 import os, errno

6 from urlparse import urlparse

40 # f=open("11816.html",'r')

41 page=f.read()

42 soup=BeautifulSoup(page)

43 hasStlFiles=False

44

get_manmankan_images.py (https://github.com/mitnk/stuff.git) Python · 143 lines

11 import time

12

13 from BeautifulSoup import BeautifulSoup

14

15 def save_image_list_to_cache(dir_name, image_list):

32 def get_chapter_list(url):

33 page = urllib2.urlopen(url)

34 soup = BeautifulSoup(page, fromEncoding="gb18030")

35 print u"Reading information of %s ..." % soup.findAll("h1")[0].string

36

57 def get_image_list(url):

58 page = urllib2.urlopen(url)

59 soup = BeautifulSoup(page)

60 javascripts = soup.findAll(text=lambda text: text.parent.name == "script")

61 image_script = ""

Makefile (https://gitlab.com/lokiexinferis/vim-configs) Makefile · 82 lines

77 build/html2vimdoc: | build

78 virtualenv build/html2vimdoc

79 build/html2vimdoc/bin/pip install beautifulsoup coloredlogs==4.0 markdown

80

81 build/vim-tools: | build

response.py (https://github.com/yoyo2k/scrapy.git) Python · 84 lines

13 from w3lib import html

14

15 from scrapy.xlib.BeautifulSoup import BeautifulSoup

16 from scrapy.http import Response, HtmlResponse

17

Makefile (https://github.com/freebsd/freebsd-ports.git) Makefile · 90 lines

48 BTLNCK_DESC= Accelerate certain NaN evals via math/py-bottleneck

49 EXCEL_DESC= MS Excel I/O Add-ons

50 HTML5LIB_DESC= Parse HTML with www/py-html5lib and www/py-beautifulsoup

51 HTML_DESC= HTML Parsing/Generation Add-ons

52 JINJA2_DESC= Support conditional HTML formatting with devel/py-Jinja2

71 BOTO_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}boto>0:devel/py-boto@${PY_FLAVOR}

72 BTLNCK_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}bottleneck>=1.2.0:math/py-bottleneck@${PY_FLAVOR}

73 HTML5LIB_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}beautifulsoup>=4.2.1:www/py-beautifulsoup@${PY_FLAVOR} \

74 ${PYTHON_PKGNAMEPREFIX}html5lib>0:www/py-html5lib@${PY_FLAVOR}

75 JINJA2_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}Jinja2>0:devel/py-Jinja2@${PY_FLAVOR}

76 LXML_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}beautifulsoup>=4.2.1:www/py-beautifulsoup@${PY_FLAVOR} \

77 ${PYTHON_PKGNAMEPREFIX}lxml>0:devel/py-lxml@${PY_FLAVOR}

78 MPL_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}matplotlib>=2.0.0:math/py-matplotlib@${PY_FLAVOR}

ParseHtmlfromFile.py (https://github.com/PuercoPop/EleccionesPeru.git) Python · 85 lines

2 # -*- coding: utf-8

3

4 from BeautifulSoup import BeautifulSoup

5 import pdb

6

15 """

16

17 soup = BeautifulSoup( f_handle )

18 a = soup.findAll('tr',height="40")

19

22 T_Flag = False

23 for item in soup.findAll('tr'):

24 for item2 in BeautifulSoup(str(item)).findAll('span',{'class':'arial_contenido_negrita'}):

25 if T_Flag == True:

26 if item2.contents == []:

78 pass

79 #print item.contents

80 #b = BeautifulSoup.BeautifulSoup(str(a))

81 #c = BeautifulSoup.BeautifulSoup( str( b.find('td',align="left" ) ) )

settings.py (https://github.com/thenoviceoof/rooibos.git) Python · 64 lines

49

50 # the backend to use when parsing the JavaScript or Stylesheet files

51 PARSER = getattr(settings, 'COMPRESS_PARSER', 'compressor.parser.BeautifulSoupParser')

52

53 # Allows changing verbosity from the settings.

listing.py (https://github.com/4teamwork/ftw.pdfgenerator.git) Python · 178 lines

1 from BeautifulSoup import BeautifulSoup

2 from ftw.pdfgenerator.html2latex import subconverter

3 from ftw.pdfgenerator.utils import html2xmlentities

39 dom = minidom.parseString(html)

40 except ExpatError, exc:

41 # cleanup html with BeautifulSoup

42 html = str(BeautifulSoup(html))

HelpIndex.py (https://github.com/esitarski/CrossMgr.git) Python · 98 lines

8 import glob

9 import re

10 from bs4 import BeautifulSoup

11

12 htmlDocDir = 'CrossMgrHtmlDoc'

44 # Extract content sections from the html pages.

45 for f in glob.iglob( os.path.join(htmlDocDir, '*.html') ):

46 doc = BeautifulSoup( open(f).read(), 'html.parser' )

47 div = doc.find('div', class_='content')

48 if not div:

PYopLib.py (https://bitbucket.org/y0no/pyopmail.git) Python · 78 lines

3 import requests

4 from os.path import join

5 from bs4 import BeautifulSoup as bs4

6 from bs4 import Comment

7

testFunctional.py (https://github.com/eaudeweb/Naaya.git) Python · 151 lines

1 import re

2 from BeautifulSoup import BeautifulSoup

3

4 from Products.Naaya.tests.NaayaFunctionalTestCase import NaayaFunctionalTestCase

140 self.browser.go('http://localhost/portal/myfolder')

141 html = self.browser.get_html()

142 soup = BeautifulSoup(html)

143

144 tables = soup.findAll('table', id='folderfile_list')

legacy.py (https://github.com/jlongman/xbmc-hockeystreams-plugin.git) Python · 174 lines

2 from abstract import AbstractHockey

3

4 from BeautifulSoup import BeautifulSoup

5 import xbmcplugin, xbmcaddon, xbmcgui

6 import hs_rss

ieo.py (https://gitlab.com/rithvikvibhu/batch-sof) Python · 71 lines

2 import json

3 import pprint

4 from bs4 import BeautifulSoup

5

6 print "batch-sof: IEO\nAuthor: Rithvik\n-----------------\n"

46 r = requests.post("http://server1.sofworld.org/ieo-result/show.php", data = payload)

47

48 soup = BeautifulSoup(r.text, "html5lib") # Soup up html

49 table_data = [[cell.text for cell in row("td")]

50 for row in BeautifulSoup(r.text, "html5lib")("tr")]

imo.py (https://gitlab.com/rithvikvibhu/batch-sof) Python · 71 lines

2 import json

3 import pprint

4 from bs4 import BeautifulSoup

5

6 print "batch-sof: IMO\nAuthor: Rithvik\n-----------------\n"

46 r = requests.post("http://server1.sofworld.org/imo-result/show.php", data = payload)

47

48 soup = BeautifulSoup(r.text, "html5lib") # Soup up html

49 table_data = [[cell.text for cell in row("td")]

50 for row in BeautifulSoup(r.text, "html5lib")("tr")]

run.py (https://bitbucket.org/skywalking/loginparttimesystem.git) Python · 118 lines

3

4 import cookielib, optparse, setting, urllib, urllib2, sys

5 from BeautifulSoup import BeautifulSoup

6 from datetime import datetime

7 from time import sleep

27 def parse_signin(content, project = 1):

28 info = ()

29 bs = BeautifulSoup(content).findAll('tr')[project]

30 v = bs.findAll('td')

31 k = bs.find('input', {'name': 'signin'})

34

35 def parse_signout(content):

36 bs = BeautifulSoup(content).find('div', {'id': 'body'})

37 if bs.text == '您沒有簽到記錄,無法進行簽退 ....':

38 return 0

wiki_semantic.py (https://github.com/japerk/geopy.git) Python · 108 lines

6

7 try:

8 from BeautifulSoup import BeautifulSoup

9 except ImportError:

10 util.logger.warn("BeautifulSoup was not found. " \

30 def parse_rdf_link(self, page, mime_type='application/rdf+xml'):

31 """Parse the URL of the RDF link from the <head> of ``page``."""

32 soup = BeautifulSoup(page)

33 link = soup.head.find('link', rel='alternate', type=mime_type)

34 return link and link['href'] or None

80 util.logger.debug("Fetching %s..." % url)

81 page = urlopen(url)

82 soup = BeautifulSoup(page)

83

84 rdf_url = self.parse_rdf_link(soup)

toc.py (https://gitlab.com/janninematt/janninematt) Python · 145 lines

11 import re

12

13 from bs4 import BeautifulSoup, Comment

14

15 from pelican import contents, signals

118 title = content.metadata.get('title', 'Title')

119 tree = node = HtmlTreeNode(None, title, 'h0', '')

120 soup = BeautifulSoup(content._content, 'html.parser')

121 settoc = False

122

136 if (settoc):

137 tree_string = '{}'.format(tree)

138 tree_soup = BeautifulSoup(tree_string, 'html.parser')

139 content.toc = tree_soup.decode(formatter='html')

140 content._content = soup.decode(formatter='html')

main.py (https://gitlab.com/smidaharoun/devoirTunisiePython) Python · 197 lines

2 import urllib2

3

4 from bs4 import BeautifulSoup

5 from flask import Flask, jsonify

6 from flask import request

12 main = "https://www.devoir.tn/"

13 page = urllib2.urlopen(main)

14 soup = BeautifulSoup(page, 'html.parser')

15 soup.prettify()

16

67 url_level = main + request_link

68 page_level = urllib2.urlopen(url_level)

69 soup_level = BeautifulSoup(page_level, 'html.parser')

70 soup_level.prettify()

71 table = soup_level.find_all("div", {'class': re.compile(r'card br-1 bgb-amber.*')})

requirements_txt_linker_spec.rb (https://gitlab.com/wendy0402/gitlab-ce) Ruby · 95 lines

27 nose

28 nose-cov

29 beautifulsoup4

30 #

31 ###### Requirements with Version Specifiers ######

69 expect(subject).to include(link('nose', 'https://pypi.python.org/pypi/nose'))

70 expect(subject).to include(link('nose-cov', 'https://pypi.python.org/pypi/nose-cov'))

71 expect(subject).to include(link('beautifulsoup4', 'https://pypi.python.org/pypi/beautifulsoup4'))

72 expect(subject).to include(link('docopt', 'https://pypi.python.org/pypi/docopt'))

73 expect(subject).to include(link('keyring', 'https://pypi.python.org/pypi/keyring'))

index.rst (https://bitbucket.org/edhaker13/flexget.git) ReStructuredText · 128 lines

15

16 * SQLAlchemy

17 * BeautifulSoup

18 * Feedparser

19 * Python-Requests

main_redis.py (https://github.com/jackfrued/Python-100-Days.git) Python · 156 lines

11 import redis

12 import requests

13 from bs4 import BeautifulSoup

14 from bson import Binary

15

71

72 def parse(self, html_page, *, domain='m.sohu.com'):

73 soup = BeautifulSoup(html_page, 'lxml')

74 for a_tag in soup.body.select('a[href]'):

75 parser = urlparse(a_tag.attrs['href'])

setup.py (https://github.com/AnneGilles/deform.git) Python · 78 lines

56 include_package_data=True,

57 zip_safe=False,

58 tests_require=requires + ['BeautifulSoup'],

59 install_requires=requires,

60 test_suite="deform",

crawl.py (https://github.com/tarunrs/osu-events-server.git) Python · 70 lines

3 import time

4 from datetime import date, timedelta, datetime

5 from BeautifulSoup import BeautifulSoup, NavigableString, Tag

6 from Events import OSUEvents, Categories, Locations, Event_Types

7 from sqlalchemy.orm import sessionmaker

40 page_url = 'http://www.osu.edu/events/indexDay.php?Event_ID=&Date=' + str_date

41 html_doc = urllib2.urlopen(page_url).read()

42 soup = BeautifulSoup(html_doc)

43 events = soup.table.contents[3].td.findAll("p")

44 for e in events:

getAWSdocs.py (https://github.com/richarvey/getAWSdocs.git) Python · 158 lines

1 #!/usr/bin/env python3

2

3 from bs4 import BeautifulSoup

4 import os, argparse

5 from urllib.parse import urlparse, urlsplit

19 html_page = urlopen(start_page)

20 # Parse the HTML page

21 soup = BeautifulSoup(html_page, 'html.parser')

22 pdfs = set()

23 print("Generating PDF list (this may take some time)")

38 def find_pdfs_in_html(url):

39 html_page_doc = urlopen(url)

40 soup_doc = BeautifulSoup(html_page_doc, 'html.parser')

41 # Get the A tag from the parsed page

42 pdfs = set()

helper.py (https://bitbucket.org/macdylan/lbforum.git) Python · 44 lines

1 #!/usr/bin/env python

2 # -*- coding: UTF-8 -*-

3 from BeautifulSoup import BeautifulSoup, NavigableString

4 from django.conf import settings

5

30

31 def clean_html( fragment ):

32 soup = BeautifulSoup( fragment.strip() )

33 def cleanup( soup ):

34 for tag in soup:

README.rst (https://github.com/liberation/django_compressor.git) ReStructuredText · 71 lines

35 Django Compressor is highly configurable and extendible. The HTML parsing

36 is done using lxml_ or if it's not available Python's built-in HTMLParser by

37 default. As an alternative Django Compressor provides a BeautifulSoup_ and a

38 html5lib_ based parser, as well as an abstract base class that makes it easy to

39 write a custom parser.

57 ``pip install django_compressor==dev`` or ``easy_install django_compressor==dev``.

58

59 .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/

60 .. _lxml: http://lxml.de/

61 .. _html5lib: http://code.google.com/p/html5lib/
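
For reference, switching Django Compressor to the BeautifulSoup-backed parser is a one-line setting; the dotted path below matches the default visible in the rooibos settings.py excerpt earlier, so treat this as a sketch rather than a guaranteed current API.

    # settings.py
    COMPRESS_PARSER = 'compressor.parser.BeautifulSoupParser'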

BeautifulSupe.py (https://github.com/mbologna/BitFountain.git) Python · 132 lines

1 # A very very minimal BeautifulSoup imitation.

2 #

3 # BS uses SGMLlib to parse, which converts everything to lower case.

independent.py (https://gitlab.com/harrigan/TPP) Python · 36 lines

4 import re

5 from crimespider.items import CrimeItem

6 from bs4 import BeautifulSoup

7

8

29 for c in response.css("div.ctx_content"):

30 article += c.extract()

31 s = BeautifulSoup(article, 'lxml')

32 print( s.get_text() )

33 print( "\n" )

single.py (https://gitlab.com/skororu/pysnippets) Python · 67 lines

7 import random # randint

8

9 import bs4 # BeautifulSoup

10 import requests # codes.ok, get

11

28 # access the permanent URL for the selected cartoon

29 req = requests.get(url)

30 page = bs4.BeautifulSoup(req.text, 'lxml')

31

32 # obtain cartoon strip title from the ctitle block

44 # so our random selection has an upper bound

45 with requests.get(base_url) as req:

46 page = bs4.BeautifulSoup(req.text, 'lxml')

47 num_previous = page.find('a', rel='prev')['href']

48 upper_limit = int(num_previous[1:-1]) + 1

parseDelscrHTM-1.py (https://github.com/gtani7/pyrb--python-scrape-spider.git) Python · 61 lines

9 # url_re=re.compile(r'<a href="(.*)" onmousedown=')

10 def __init__(self,mag_file): # mag_file is string, not file obj

11 from BeautifulSoup import BeautifulSoup

12 soup = BeautifulSoup(mag_file)

25

26 def __init__(self,red_file):

27 from BeautifulSoup import BeautifulSoup

28 soup = BeautifulSoup(red_file)

geoserver.py (https://github.com/dotskapes/dotSkapes.git) Python · 104 lines

1 from urllib import urlencode

2 from urllib2 import urlopen

3 from BeautifulSoup import BeautifulStoneSoup

4

5 db.define_table ('geoserver_sources',

version_check.py (https://gitlab.com/mimizone/kolla) Python · 126 lines

18 import sys

19

20 from bs4 import BeautifulSoup as bs

21 from oslo_config import cfg

22 import pkg_resources

hupu_redis.py (https://github.com/rieuse/learnPython.git) Python · 67 lines

6 import redis

7 import threadpool

8 from bs4 import BeautifulSoup

9 import requests

10 from lxml import etree

voc_det_generator.py (https://github.com/donnyyou/torchcv.git) Python · 138 lines

9 import argparse

10 import shutil

11 from bs4 import BeautifulSoup

12

13

49 xml_tree = file_stream.readlines()

50 xml_tree = ''.join([line.strip('\t') for line in xml_tree])

51 xml_tree = BeautifulSoup(xml_tree, "html5lib")

52 for obj in xml_tree.findAll('object'):

53 object = dict()

catalogparser.py (https://github.com/jeffh/YACS.git) Python · 106 lines

1 import urllib2

2 import re

3 from BeautifulSoup import BeautifulSoup

4 from rpi_courses.config import DEPARTMENTS

5

39 def get_course_detail(course_page):

40 course_page = re.sub('<br */?>', '\n', course_page)

41 soup = BeautifulSoup(course_page, convertEntities=BeautifulSoup.HTML_ENTITIES)

42 title_text = soup.findAll('h1 h2 h3 h4 h5 h6'.split(' '))[0].text

43 title = re.search('([\w+\s]+) (\d+\w+) \- (.*)', title_text)

scrape.py (https://github.com/sneeu/aliss_scrapers.git) Python · 113 lines

6 import urllib2

7

8 from BeautifulSoup import BeautifulSoup

9

10 from soupselect import select as css

31 html = html.replace('<!- Google Analytics -->', '')

32 html = re.sub('<script.*?>[\s\S]*?</.*?script>', '', html)

33 soup = BeautifulSoup(html)

34

35 item = {}

zad_7.py (https://gitlab.com/mmeisel/LV) Python · 43 lines

7 import re

8 import urllib

9 from bs4 import BeautifulSoup

10

11 def ripDomain(): # function that "rips" the domain out of the link

25 urlAddr=correctURL() # fix the url if needed

26 html=urllib.urlopen(urlAddr, "lxml").read() # open the url

27 soup=BeautifulSoup(html) # and create a BeautifulSoup object

28

29 domain=ripDomain() # look up the domain

scraping-the-web.rst (https://github.com/EnTeQuAk/pydanny-event-notes.git) ReStructuredText · 109 lines

43 ===========

44

45 * BeautifulSoup is old and not maintained anymore

46 * html5lib

47 - builds BeautifulSoup objects
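
These notes refer to the original BeautifulSoup 3 package; both import styles appear throughout these results. A quick side-by-side sketch, assuming both the old BeautifulSoup and beautifulsoup4 distributions are installed:

    # BeautifulSoup 3 -- the unmaintained package the notes mention
    from BeautifulSoup import BeautifulSoup as BS3
    # beautifulsoup4 -- the maintained successor ("bs4")
    from bs4 import BeautifulSoup

    html = "<p>Hello <b>world</b></p>"
    soup3 = BS3(html)                            # BS3: parser and encoding handled implicitly
    soup4 = BeautifulSoup(html, "html.parser")   # bs4: parser named explicitly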

test-broken-html.py (https://github.com/JonathanRRogers/twill.git) Python · 181 lines

24

25 commands.config('use_tidy', '0')

26 commands.config('use_BeautifulSoup', '0')

27 commands.config('allow_parse_errors', '0')

28

60

61 commands.config('use_tidy', '1')

62 commands.config('use_BeautifulSoup', '0')

63 commands.config('allow_parse_errors', '0')

64

89 # pass

90

91 def test_BeautifulSoup():

92 """

93 test parsing of BS-processed HTML.

conversation.py (https://gitlab.com/sanchezfauste/TweetDigraph) Python · 108 lines

1 from bs4 import BeautifulSoup

2 import requests

3

94 req = requests.get(url)

95 if req.status_code == 200:

96 html = BeautifulSoup(req.text, 'html.parser')

97 conversations = html.find_all('li', {'class':'ThreadedConversation'})

98 conversations += html.find_all('div', \

burp-to-sqlmap.py (https://github.com/Miladkhoshdel/burp-to-sqlmap.git) Python · 191 lines

2 import sys

3 import os

4 from bs4 import BeautifulSoup

5 import os.path

6 import argparse

103 print(" [+] Exporting Packets ...")

104 with open(filename, 'r') as f:

105 soup = BeautifulSoup(f.read(), "html.parser")

106 for i in soup.find_all("request"):

107 packetnumber = packetnumber + 1

143

144 with open(filename, 'r') as f:

145 soup = BeautifulSoup(f.read(), "html.parser")

146 for i in soup.find_all("request"):

147 packetnumber = packetnumber + 1

bills.py (https://github.com/runderwood/openstates.git) Python · 138 lines

4 import datetime as dt

5 import urllib2

6 from BeautifulSoup import BeautifulSoup

7

8 from fiftystates.scrape.bills import BillScraper, Bill

11 '''Remove some irregularities from WV's HTML.

12

13 It includes a spurious </HEAD> before the useful data begins and lines like '<option value="Bill"selected="selected">Bill</option>', in which the lack of a space between the attributes confuses BeautifulSoup.

14 '''

15 data = data.replace('</HEAD>', '')

68 if not sessionexisted(data):

69 return False

70 soup = BeautifulSoup(cleansource(data))

71 rows = soup.findAll('table')[1].findAll('tr')[1:]

72 for row in rows:
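
A self-contained sketch of the cleanup the docstring describes; the regular expression is illustrative rather than taken from the scraper:

    import re

    def cleansource(data):
        # drop the spurious </HEAD> mentioned in the docstring
        data = data.replace('</HEAD>', '')
        # re-insert the space missing between attributes, e.g. value="Bill"selected="selected"
        return re.sub(r'"(?=[A-Za-z-]+=")', '" ', data)

    print(cleansource('</HEAD><option value="Bill"selected="selected">Bill</option>'))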

testFunctional.py (https://github.com/eaudeweb/Naaya.git) Python · 162 lines

1 import re

2 from unittest import TestSuite, makeSuite

3 from BeautifulSoup import BeautifulSoup

4

5 from Products.Naaya.tests.NaayaFunctionalTestCase import NaayaFunctionalTestCase

146 self.browser.go('http://localhost/portal/myfolder')

147 html = self.browser.get_html()

148 soup = BeautifulSoup(html)

149

150 tables = soup.findAll('table', id='folderfile_list')

hdtrailers.py (https://github.com/ryanrdetzel/CouchPotato.git) Python · 128 lines

1 from app.config.cplog import CPLog

2 from app.lib.provider.rss import rss

3 from imdb.parser.http.bsouplxml._bsoup import SoupStrainer, BeautifulSoup

4 from string import letters, digits

5 from urllib import urlencode

68 try:

69 tables = SoupStrainer('div')

70 html = BeautifulSoup(data, parseOnlyThese = tables)

71 resultTable = html.findAll('h2', text = re.compile(movie))

72

92 try:

93 tables = SoupStrainer('table')

94 html = BeautifulSoup(data, parseOnlyThese = tables)

95 resultTable = html.find('table', attrs = {'class':'bottomTable'})

96

pingback.py (https://github.com/sumegha/django-gstudio.git) Python · 140 lines

15 from gstudio.models import Nodetype

16 from gstudio.settings import PINGBACK_CONTENT_LENGTH

17 from BeautifulSoup import BeautifulSoup

18 from django_xmlrpc.decorators import xmlrpc_func

19

92 return TARGET_IS_NOT_PINGABLE

93

94 soup = BeautifulSoup(document)

95 title = soup.find('title')

96 title = title and strip_tags(title) or _('No title')

__init__.py (https://github.com/palli81/headphones.git) Python · 96 lines

39

40 treeType - the name of the tree type required (case-insensitive). Supported

41 values are "simpletree", "dom", "etree" and "beautifulsoup"

42

43 "simpletree" - a built-in DOM-ish tree type with support for some

50 elementtree-like interface (known to work with

51 ElementTree, cElementTree and lxml.etree).

52 "beautifulsoup" - Beautiful soup (if installed)

53

54 implementation - (Currently applies to the "etree" and "dom" tree types). A

69 import simpletree

70 treeBuilderCache[treeType] = simpletree.TreeBuilder

71 elif treeType == "beautifulsoup":

72 import soup

73 treeBuilderCache[treeType] = soup.TreeBuilder
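
A usage sketch against this older html5lib API, assuming a pre-1.0 html5lib release that still ships the "beautifulsoup" tree builder (with BeautifulSoup 3 installed):

    import html5lib
    from html5lib import treebuilders

    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse("<p>Hello <b>world")   # returns a BeautifulSoup tree
    print(soup.find("b").string)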

shorter.py (https://bitbucket.org/badc0re/xsser_gsoc.git) Python · 76 lines

26 import pycurl

27 from cStringIO import StringIO

28 from BeautifulSoup import BeautifulSoup

29

30 class ShortURLReservations(object):

64 c.close()

65

66 soup = BeautifulSoup(out.getvalue())

67 if self._service == 'tinyurl':

68 return soup.findAll('blockquote')[1].findAll('a')[0]['href']

vt_hash2filenames.py (https://bitbucket.org/Vnoxygen/malformity.git) Python · 43 lines

2

3 import re

4 from BeautifulSoup import BeautifulSoup

5 from canari.maltego.utils import debug, progress

6 from canari.framework import configure

bootstrap.py (https://github.com/Huawei/containerops.git) Python · 185 lines

5 import sys

6 import glob

7 from bs4 import BeautifulSoup

8 import json

9 import anymarkup

90 with open(os.path.join(root, file_name), 'r') as f:

91 data = f.read()

92 soup = BeautifulSoup(data, 'html.parser')

93 title = soup.find('title').text

94 body = soup.find('body').renderContents()

utils.py (https://github.com/Gautier/django-page-cms.git) Python · 139 lines

118 if content_type in ('title', 'slug'):

119 return content

120 from BeautifulSoup import BeautifulSoup

121 tree = BeautifulSoup(content)

release.py (https://gitlab.com/LocutusOfPenguin/python-chess) Python · 178 lines

140 print(res)

141 sys.exit(1)

142 soup = bs4.BeautifulSoup(res.text, "html.parser")

143 csrf = soup.find("input", {"name": "CSRFToken"})["value"]

144 print("CSRF: {0}".format(csrf))

pyurllib.py (https://github.com/TsingJyujing/DataSpider.git) Python · 196 lines

11 import threading

12

13 from bs4 import BeautifulSoup

14

15 from tsing_spider.config import (

47 :return:

48 """

49 return BeautifulSoup(http_get(url), get_xml_decoder()) # html.parser

50

51

182 class LazySoup(LazyContent):

183 """

184 Lazy-loaded URL resource, and parse by BeautifulSoup

185 """

186

transforms.py (https://github.com/giacomos/jarn.xmpp.core.git) Python · 94 lines

3 import urllib2

4 from urlparse import urlparse

5 from BeautifulSoup import BeautifulSoup

6

7 from plone.memoize import ram

27 return None

28 try:

29 doc = BeautifulSoup(urllib2.urlopen(url).read())

30 except UnicodeEncodeError: # This is for links to files/images.

31 doc = BeautifulSoup('')

types.py (https://github.com/rxuriguera/bibtexIndexMaker.git) Python · 87 lines

36 Parse error in search results.

37 self.msg attribute contains explanation why parsing failed

38 self.tag attribute contains BeautifulSoup object with the most relevant tag

39 that failed to parse

40 Thrown only in debug mode

importer.py (https://github.com/gregmalcolm/Bookie.git) Python · 201 lines

1 """Importers for bookmarks"""

2 from datetime import datetime

3 from BeautifulSoup import BeautifulSoup

4 from bookie.models import BmarkMgr

5

77 delicious_doctype = "DOCTYPE NETSCAPE-Bookmark-file-1"

78

79 soup = BeautifulSoup(file_io)

80 can_handle = False

81 can_handle = DelImporter._is_delicious_format(soup,

89 def process(self, fulltext=None):

90 """Given a file, process it"""

91 soup = BeautifulSoup(self.file_handle)

92

93 for tag in soup.findAll('dt'):

test_microformats.py (https://bitbucket.org/inirudebwoy/gdziebylkaziu.git) Python · 155 lines

4 from geopy.parsers.html import GeoMicroformat

5 try:

6 from BeautifulSoup import BeautifulSoup

7 except ImportError:

8 BeautifulSoup = None

24

25 def test_one_soup(self):

26 if BeautifulSoup:

27 locations = self.parser.find_all(BeautifulSoup(self.MARKUP))

30

31 def test_multi_soup(self):

32 if BeautifulSoup:

33 locations = self.parser.find_all(BeautifulSoup(self.MARKUP * 3))

49

50 def test_none_soup(self):

51 if BeautifulSoup:

52 locations = self.parser.find_all(BeautifulSoup(self.MARKUP))

share_post.py (https://gitlab.com/janninematt/janninematt) Python · 81 lines

7 """

8

9 from bs4 import BeautifulSoup

10 try:

11 from urllib.parse import quote

17

18 def article_title(content):

19 main_title = BeautifulSoup(content.title, 'html.parser').get_text().strip()

20 sub_title = ''

21 if hasattr(content, 'subtitle'):

22 sub_title = ' ' + BeautifulSoup(content.subtitle, 'html.parser').get_text().strip()

23 return quote(('%s%s' % (main_title, sub_title)).encode('utf-8'))

24

layouttestresults.py (https://gitlab.com/x33n/phantomjs) Python · 91 lines

30

31 from webkitpy.common.net.resultsjsonparser import ResultsJSONParser

32 from webkitpy.thirdparty.BeautifulSoup import BeautifulSoup, SoupStrainer

33 from webkitpy.layout_tests.models import test_results

34 from webkitpy.layout_tests.models import test_failures

setup.py (https://github.com/eged/django-blog-zinnia.git) Python · 37 lines

32 'django-mptt',

33 'akismet',

34 'BeautifulSoup',

35 ])

36

test_html.py (https://github.com/openhatch/oh-mainline.git) Python · 151 lines

26 list(br.links())

27

28 def test_robust_form_parser_uses_beautifulsoup(self):

29 factory = mechanize.RobustFormsFactory()

30 self.assertIs(factory.form_parser_class,

31 mechanize._form.RobustFormParser)

32

33 def test_form_parser_does_not_use_beautifulsoup(self):

34 factory = mechanize.FormsFactory()

35 self.assertIs(factory.form_parser_class, mechanize._form.FormParser)

baidu_spider_progress.py (https://github.com/kkyon/botflow.git) Python · 123 lines

1 from botflow import Pipe, Branch, Timer

2 from botflow import BotFlow

3 from bs4 import BeautifulSoup

4 from dataclasses import dataclass

5 from botflow.ex.http import HttpLoader

27 # parse the individual result items

28 def get_all_items(response):

29 soup = BeautifulSoup(response.text, "lxml")

30 items = soup.select('div.result.c-container')

31 result = []

46 #BD_URL='https://180.97.33.108' #

47 BD_URL='https://www.baidu.com'

48 soup = BeautifulSoup(response.text, "lxml")

49 page = soup.select('div#page')

50 for item in page[0].find_all('a'):

Python爬取日记八:斗鱼弹幕相关信息保存到mongodb.py (https://github.com/rieuse/learnPython.git) Python · 94 lines

11 import pymongo

12 import requests

13 from bs4 import BeautifulSoup

14

15 clients = pymongo.MongoClient('localhost')

83 def get_name(roomid):

84 r = requests.get("http://www.douyu.com/" + roomid)

85 soup = BeautifulSoup(r.text, 'lxml')

86 return soup.find('a', {'class', 'zb-name'}).string

87

feats.py (https://github.com/devonjones/PSRD-Parser.git) Python · 153 lines

2 import json

3 import re

4 from BeautifulSoup import BeautifulSoup

5 from psrd.rules import write_rules

6 from psrd.files import char_replace

85 p = find_section(feat, name="Prerequisites", section_type='section')

86 if p != None:

87 soup = BeautifulSoup(p['text'])

88 p['description'] = ''.join(soup.findAll(text=True))

89 del p['text']

117 feat['sections'].remove(section)

118 if feat.has_key('text') and not feat.has_key('description'):

119 soup = BeautifulSoup(feat['text'])

120 feat['description'] = ''.join(soup.findAll(text=True))

121 del feat['text']

plugin.py (https://github.com/gsf/supybot-plugins.git) Python · 47 lines

8 import supybot.callbacks as callbacks

9

10 from BeautifulSoup import BeautifulSoup

11

12 def lookup(word):

22 return 'http error %s for %s' % (e.code, url)

23

24 soup = BeautifulSoup(doc)

25 dd = soup.find('dd', 'highlight')

26

download_russian_contrast.py (https://bitbucket.org/Meister17/term-extraction.git) Python · 108 lines

1 #!/usr/bin/env python

2 # -*- coding: utf-8 -*-

3 from BeautifulSoup import BeautifulSoup

4 import optparse

5 import os

37 raise Exception('Wrong number of words')

38 html = response.read()

39 soup = BeautifulSoup(html)

40 zero_result = False;

41 for p in soup.findAll('p'):

refreshportlet.py (https://github.com/plone/plone.app.kss.git) Python · 64 lines

1 from zope.deprecation import deprecate

2

3 from kss.core.BeautifulSoup import BeautifulSoup

4 from kss.core import CommandSet

5 from plone.app.portlets.utils import assignment_from_key

26 # So we just select the <dl> for insertion.

27 # This could be spared with smarter templating.

28 soup = BeautifulSoup(portlet_body)

29 tag = soup.find('dl', id=nodeid)

30 result = unicode(tag)

middleware.py (https://github.com/ralphbean/raptorizemw.git) Python · 151 lines

1

2 import BeautifulSoup

3 import datetime

4 import random

102 """

103

104 soup = BeautifulSoup.BeautifulSoup(resp.body)

105

106 if not soup.html:

108

109 if not soup.html.head:

110 soup.html.insert(0, BeautifulSoup.Tag(soup, "head"))

111

112 prefix = self.resources_app.prefix

113 js_helper = BeautifulSoup.Tag(

114 soup, "script", attrs=[

115 ('type', 'text/javascript'),

utils.py (https://github.com/nielssprong/lernanta.git) Python · 116 lines

8

9

10 from BeautifulSoup import BeautifulSoup

11

12 from django.conf import settings

69 both.

70 """

71 soup = BeautifulSoup(content)

72 links = soup.findAll('link')

73

74 # BeautifulSoup instances are not actually dictionaries, so

75 # we can't use the more proper 'key in dict' syntax and

76 # must instead use the deprecated 'has_key()' method.
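
The comment refers to BeautifulSoup 3 Tag objects; a minimal illustration of the quirk (in bs4 the idiomatic test would be 'href' in link.attrs):

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<link rel="alternate" href="/feed.rss">')
    link = soup.find('link')
    if link.has_key('href'):      # BS3 Tags expose has_key(), not the usual "in" test
        print(link['href'])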

Makefile (https://github.com/freebsd/freebsd-ports.git) Makefile · 34 lines

16 ${PYTHON_PKGNAMEPREFIX}keyring>0:security/py-keyring@${PY_FLAVOR} \

17 ${PYTHON_PKGNAMEPREFIX}psutil>=2.0:sysutils/py-psutil@${PY_FLAVOR}

18 RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}beautifulsoup>=4.2.1:www/py-beautifulsoup@${PY_FLAVOR} \

19 ${PYTHON_PKGNAMEPREFIX}importlib-metadata>0:devel/py-importlib-metadata@${PY_FLAVOR} \

20 ${PYTHON_PKGNAMEPREFIX}keyring>0:security/py-keyring@${PY_FLAVOR} \

pingback.py (https://github.com/aparo/django-blog-zinnia.git) Python · 141 lines

16 from zinnia.managers import PINGBACK

17 from zinnia.settings import PINGBACK_CONTENT_LENGTH

18 from BeautifulSoup import BeautifulSoup

19 from django_xmlrpc.decorators import xmlrpc_func

20

93 return TARGET_IS_NOT_PINGABLE

94

95 soup = BeautifulSoup(document)

96 title = soup.find('title')

97 title = title and strip_tags(title) or _('No title')

shotchart_cbssports.py (https://github.com/kpascual/nbascrape.git) Python · 127 lines

5 import sys

6 import logging

7 from BeautifulSoup import BeautifulSoup

8 from libscrape.config import constants

9

search_opportunities.py (https://github.com/PacktPublishing/Python-Automation-Cookbook.git) Python · 150 lines

8 import delorean

9 import requests

10 from bs4 import BeautifulSoup

11 import mistune

12 import jinja2

61 # Get the article

62 response = requests.get(entry.link)

63 article = BeautifulSoup(response.text, 'html.parser')

64 article_reference = (article.title.string.strip(),

65 entry.summary.strip(),

CalendarBuilder.py (https://github.com/levyd/CalendarBuilder.git) Python · 70 lines

1 from sys import stderr

2 from BeautifulSoup import BeautifulSoup

3 from icalendar import Calendar, Event, vRecur

4 from datetime import datetime, timedelta

39 def parse(self, infile):

40 """Parses the dalonline HTML into a schedule for one week"""

41 doc = BeautifulSoup(infile.read())

42

43 # Get the base date (Monday) of the webpage's calendar

lx_simple.py (https://github.com/jabbalaci/jabbapylib.git) Python · 141 lines

53 </html>'''

54 doc = lx.to_doc(html)

55 print lx.prettify(doc, method=scraper.BEAUTIFULSOUP)

56

57 def demo4():

114 #doc = lx.to_doc(text, parser=scraper.HTML5PARSER)

115 #doc = lx.to_doc(text)

116 doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)

117 #print type(doc)

118 #print etree.tostring(doc)

extraer_datos_composicion_alimentos.py (https://gitlab.com/FoodUpProject/FoodUp) Python · 54 lines

1 # -*- coding: utf-8 -*-

2 import urllib2,unicodedata

3 from bs4 import BeautifulSoup

4

5 # analysis method for a web address

6 def analisisDescarga(archivo,conexion):

7 html = conexion.read()

8 soup = BeautifulSoup(html)

9 # get a list of Strings filtered on class attributes with the values details and price

10 links = soup.find_all(True, {'align':['left','right']})

html.py (https://bitbucket.org/charlisim/search_url_web.git) Python · 101 lines

21 '''

22 import re

23 from bs4 import BeautifulSoup

24 from url import *

25

27 class html:

28 def __init__(self, content):

29 self.content = BeautifulSoup(content)

30

31

createspace-scraper.py (https://github.com/russx2/createspace-scraper.git) Python · 88 lines

2 import sys

3 import requests

4 from BeautifulSoup import BeautifulSoup

5

6 def get_sales(email, password, date_start, date_end):

53 r = session.post('https://www.createspace.com/pub/reports/ajax/table.salesdetails.do?sid=' + token + '&msk=mr')

54

55 markup = BeautifulSoup(r.content)

56 markupHeadingBlock = markup.find('tr', {'class': 'head2'})

57 totalQuantity = markupHeadingBlock.find(text = re.compile('\d+'))

get_legislation.py (https://github.com/gosuri/fiftystates.git) Python · 118 lines

3 import unicodedata;

4 import re

5 from BeautifulSoup import BeautifulSoup

6

7 # ugly hack

44 response = urllib2.urlopen(req)

45 doc = response.read()

46 soup = BeautifulSoup(doc)

47

48 #parse results

porn4days.py (https://github.com/alfa-addon/addon.git) Python · 147 lines

17 from core import servertools

18 from core import httptools

19 from bs4 import BeautifulSoup

20

21 host = 'http://porn4days.biz/'

87 if unescape:

88 data = scrapertools.unescape(data)

89 soup = BeautifulSoup(data, "html5lib", from_encoding="utf-8")

90 return soup

91

discord_insult_spam_dm.py (https://github.com/Merubokkusu/discord-spam-bots.git) Python · 114 lines

18 import os

19 import random

20 from bs4 import BeautifulSoup

21 sys.path.append("./.")

22 from config import *

54 file.close()

55 html = urllib.request.urlopen("https://insult.mattbas.org/api/insult.html").read()

56 soup = BeautifulSoup(html,"html.parser")

57 insult_text = soup.find('h1')

58 print(insult_text.text)

81 file.close()

82 html = urllib.request.urlopen("https://insult.mattbas.org/api/insult.html").read()

83 soup = BeautifulSoup(html,"html.parser")

84 insult_text = soup.find('h1')

85 print(insult_text.text)

lxmlselector.py (https://github.com/steeve/scrapy-lxmlselector.git) Python · 154 lines

2 Lxml selector

3 Provides both XPath and CSS Selection.

4 Can use html5lib and BeautifulSoup.

5

6 Provided by Steeve Morin <steeve.morin@gmail.com>

25

26 def __init__(self, response=None, text=None, node=None, parent=None, expr=None,

27 use_html5lib=False, use_BeautifulSoup=False, namespaces=None):

28 if parent:

29 self.doc = parent.doc

31 elif response:

32 self.xmlNode = self._lxml_parse_document(response.body, use_html5lib,

33 use_BeautifulSoup)

34 self.doc = self.xmlNode.getroottree()

35 elif text:

mgstage.py (https://github.com/yoshiko2/AV_Data_Capture.git) Python · 129 lines

4 from lxml import etree

5 import json

6 from bs4 import BeautifulSoup

7 from ADC_function import *

8 # import sys

99 number=number2.upper()

100 htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))

101 soup = BeautifulSoup(htmlcode, 'lxml')

102 a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')

103 b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')

browser.py (https://gitlab.com/phyks/weboob) Python · 120 lines

21 from weboob.deprecated.browser import Browser, BrowserIncorrectPassword

22 from weboob.deprecated.browser.parsers.iparser import IParser

23 import BeautifulSoup

24

25 from .pages import PagePrivateThreadsList, PagePrivateThread, PageLogin, PageIndex, DummyPage, PageUserProfile, PageCityList

31 class SoupParser(IParser):

32 def parse(self, data, encoding=None):

33 return BeautifulSoup.BeautifulSoup(data.read().decode(encoding or 'utf-8'), convertEntities=BeautifulSoup.BeautifulStoneSoup.ALL_ENTITIES)

34

35

pipelines.py (https://github.com/richshaw2015/oh-my-rss.git) Python · 142 lines

10 import django

11 import urllib

12 from bs4 import BeautifulSoup

13 import lxml.etree as etree

14 import re

40

41 def process_item(self, item, spider):

42 content_soup = BeautifulSoup(item['content'], "html.parser")

43

44 # to absolute external href

2009-2-6-my-macheist-release-estimate.markdown (https://github.com/FranklinChen/mattfoster.github.com.git) Markdown · 42 lines

14 <div class="thumbnail"><a href="http://skitch.com/mattfoster/bd2ut/macheist-mainframe"><img src="http://img.skitch.com/20090206-nd331ywttf11ypf684ehdr5scr.preview.jpg" alt="MacHeist: Mainframe" /></a><br /><span style="font-family: Lucida Grande, Trebuchet, sans-serif, Helvetica, Arial; font-size: 10px; color: #808080">Uploaded with <a href="http://plasq.com/">plasq</a>'s <a href="http://skitch.com">Skitch</a>!</span></div>

15

16 I used [BeautifulSoup](http://crummy.com/software/BeautifulSoup "Beautiful Soup: We called him Tortoise because he taught us.") with [urllib2](http://docs.python.org/library/urllib2.html "urllib2 — extensible library for opening URLs &mdash; Python v2.6.1 documentation") and a tiny regular expression, and here's the result:

17

18 <script src="http://gist.github.com/59370.js"></script>
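
The actual code lives in the linked gist; a generic sketch of that urllib2 + BeautifulSoup + regex combination (the URL and element lookup are placeholders, not the real MacHeist markup):

    import re
    import urllib2
    from BeautifulSoup import BeautifulSoup

    html = urllib2.urlopen("http://example.com/stats").read()      # placeholder URL
    soup = BeautifulSoup(html)
    block = soup.find("div", {"class": "stats"})                   # placeholder element
    numbers = [int(n) for n in re.findall(r"\d+", ''.join(block.findAll(text=True)))]
    print(numbers)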

congress.py (https://github.com/michaelmyers/python-congress.git) Python · 138 lines

10

11 from datetime import datetime

12 from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup

13 from openpyxl.workbook import Workbook

14 from openpyxl.writer.excel import ExcelWriter

123 url = 'http://clerk.house.gov/evs/' + str(year) + '/index.asp'

124 page = urllib2.urlopen(url)

125 soup = BeautifulSoup(page)

126 text = soup.find('a')

127

ny_times_pre_1981_scraper.py (https://github.com/slifty/rdiscraper.git) Python · 101 lines

1 # For processing HTML

2 from BeautifulSoup import BeautifulSoup

3

4 # Fetch the Times's Advanced Search results for 'the.' urllib allows passing of HTML data

42 # print html_data

43

44 # Have BeautifulSoup parse the HTML into a tree of objects we can use

45 soup = BeautifulSoup(html_data)

scrape.py (https://gitlab.com/mkhouri/news_scraper) Python · 72 lines

1 import re

2 from bs4 import BeautifulSoup

3 from urllib.parse import urlparse

4 import requests

6

7 def parse(url, pageHtml, bodyLines):

8 soup = BeautifulSoup(pageHtml, "lxml")

9 host = urlparse(url).hostname

10

ece301.py (https://github.com/kuruoujou/Course-Note-Grabber.git) Python · 174 lines

4 os.putenv("DISPLAY",":0.0")

5

6 from BeautifulSoup import BeautifulSoup

7 import re

8 from urllib2 import urlopen

34 #Downloads past and upcoming exam info...

35 page = urlopen(home)

36 soup = BeautifulSoup(page)

37 links = soup.findAll(href=re.compile('.*?\.pdf'))

38 eicount = 0

52 #Downloads Course Docs

53 page = urlopen(docs)

54 soup = BeautifulSoup(page)

55 links = soup.findAll(href=re.compile('.*?\.pdf|.*?\.html?'))

56 dcount = 0

ci.py (https://github.com/pwxcoo/chinese-xinhua.git) Python · 72 lines

8

9 import requests, csv

10 from bs4 import BeautifulSoup

11 import time

12 from multiprocessing.dummy import Pool as ThreadPool

25

26 print(f'{url} is parsing')

27 html = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")

28 a = html.find_all('a', target="_blank")

29

36 try:

37 response = requests.get(words[i])

38 wordhtml = BeautifulSoup(response.content.decode('gbk', errors='ignore').replace('<br/>', '\n').replace('<br>', '\n')\

39 , "lxml")

40 td = wordhtml.find_all('table')[5].find_all('td')

PostNewsController.java (https://github.com/fuyunwang/ChengFeng1.5.git) Java · 142 lines

1 package com.beautifulsoup.chengfeng.controller.community;

2

3 import com.beautifulsoup.chengfeng.common.ResponseResult;

4 import com.beautifulsoup.chengfeng.controller.vo.PostNewsDetailVo;

5 import com.beautifulsoup.chengfeng.controller.vo.PostNewsVo;

6 import com.beautifulsoup.chengfeng.controller.vo.PostReplyVo;

7 import com.beautifulsoup.chengfeng.controller.vo.PosterVo;

8 import com.beautifulsoup.chengfeng.pojo.Journalism;

9 import com.beautifulsoup.chengfeng.pojo.PostNews;

10 import com.beautifulsoup.chengfeng.service.PostNewsService;

11 import com.beautifulsoup.chengfeng.service.dto.PostNewsDto;

create-manual.py (git://github.com/residuum/PuRestJson.git) Python · 68 lines

6 import subprocess

7 import sys

8 from bs4 import BeautifulSoup

9

10 wikiDir = '/tmp/PuRestJson.wiki/'

37 # edit links to css, images and other pages.

38 htmlDoc = open(exportDir + htmlFile)

39 soup = BeautifulSoup(htmlDoc, 'lxml')

40 for s in soup.findAll('link'):

41 s.extract()

test_sitegen.py (https://gitlab.com/Ivy001/pants) Python · 213 lines

89 self.config = json.loads(CONFIG_JSON)

90 self.soups = {

91 'index': bs4.BeautifulSoup(INDEX_HTML),

92 'subdir/page1': bs4.BeautifulSoup(P1_HTML),

93 'subdir/page2': bs4.BeautifulSoup(P2_HTML),

94 'subdir/page2_no_toc': bs4.BeautifulSoup(P2_HTML),

html.py (https://github.com/feyin/lamson.git) Python · 180 lines

25 """

26

27 from BeautifulSoup import BeautifulSoup

28 import clevercss

29 from lamson import mail, view

93 """

94 Used mostly internally but helpful for testing, this takes the given HTML

95 and applies the configured CSS you've set. It returns a BeautifulSoup

96 object with all the style attributes set and nothing else changed.

97 """

98 doc = BeautifulSoup(html)

99 roots = {} # the roots rarely change, even though the paths do

100

get_kottke.py (https://github.com/wilson428/Robottke.git) Python · 97 lines

7 from datetime import datetime

8

9 from BeautifulSoup import BeautifulSoup

10

11 prefix = '../'

33 url = "http://kottke.org/" + syear + "/" + smonth + "/"

34 print "<-----------------------" + url

35 soup = BeautifulSoup(urllib2.urlopen(url))

36 for entry in soup.findAll('div', { "class" : "post" }):

37 try:

soup.py (https://gitlab.com/tlevine/dexy) Python · 132 lines

1 from bs4 import BeautifulSoup

2 from dexy.filter import DexyFilter

3 from dexy.utils import chdir

12 Add <script> tags or <link> tags to an HTML file's header.

13

14 Uses BeautifulSoup.

15 """

16 aliases = ['customize']

22

23 def process_text(self, input_text):

24 soup = BeautifulSoup(input_text)

25

26 for js in self.setting('scripts'):

41

42 _settings = {

43 'html-parser' : ("Name of html parser BeautifulSoup should use.", 'html.parser'),

44 'inline-images' : ("Whether to inline images using the data uri scheme.", True),

45 'inline-styles' : ("Whether to embed referenced CSS in the page header.", True)
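
A generic bs4 sketch of the same idea, independent of the dexy filter API: build a tag and append it to the document head.

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<html><head></head><body></body></html>", "html.parser")
    script = soup.new_tag("script", src="app.js")   # src value is illustrative
    soup.head.append(script)
    print(soup)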

institution_test_suite.py (https://github.com/adsabs/scripts-affiliation-disambiguation.git) Python · 187 lines

173 def format_results(results):

174

175 from BeautifulSoup import UnicodeDammit

176

177 new_results = []

test_lxml.py (https://github.com/openhatch/oh-mainline.git) Python · 91 lines

16

17 from bs4 import (

18 BeautifulSoup,

19 BeautifulStoneSoup,

20 )

Makefile (https://bitbucket.org/bendikro/deluge-yarss-plugin.git) Makefile · 130 lines

73 @echo "Build finished; now you can run "qcollectiongenerator" with the" \

74 ".qhcp project file in $(BUILDDIR)/qthelp, like this:"

75 @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp"

76 @echo "To view the help file:"

77 @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc"

82 @echo "Build finished."

83 @echo "To view the help file:"

84 @echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup"

85 @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup"

ServantStats.py (https://bitbucket.org/TheMysteryofDoom/doom-utilityapp.git) Python · 105 lines

2 import re

3 from discord.ext import commands

4 try: # check if BeautifulSoup4 is installed

5 from bs4 import BeautifulSoup

32 url = "http://fategrandorder.wikia.com/wiki/"+searcharg

33 async with aiohttp.get(url) as response:

34 soup = BeautifulSoup(await response.text(), 'html.parser')

35 try:

36 base = soup.find("div", {"class": "ServantInfoStatsWrapper"})

102 bot.add_cog(ServantStats(bot))

103 else:

104 raise RuntimeError("You need to run `pip3 install beautifulsoup4`")

105

106

translate-update.py (https://github.com/telegram-zhCN/telegram-language-resources.git) Python · 65 lines

2 import os

3 import requests

4 from bs4 import BeautifulSoup

5

6 identification = os.environ.get('TRANSIFEX_USERNAME')

22 s = requests.session()

23 r = s.get('https://www.transifex.com/signin/')

24 soup = BeautifulSoup(r.text, 'html.parser')

25 csrftoken = soup.find('input', {'name': 'csrfmiddlewaretoken'})['value']

26

test_flask_get.py (https://github.com/fredrik-corneliusson/click-web.git) Python · 74 lines

1 import pytest

2 from bs4 import BeautifulSoup

3

4

70

71 def _get_form_ids(html):

72 soup = BeautifulSoup(html, 'html.parser')

73 form_ids = [elem['name'] for elem in soup.find_all(['input', 'select', 'textarea'])]

74 return form_ids

snippet.py (https://github.com/gistable/gistable.git) Python · 180 lines

7 from datetime import datetime

8 import urllib.request

9 from bs4 import BeautifulSoup

10 import re

11 import string

39 return None

40 the_page = response.read()

41 soup = BeautifulSoup( the_page )

42

43 return soup

statusserver.py (https://github.com/DooMLoRD/Xperia-2011-Official-Kernel-Sources.git) Python · 96 lines

33 # WebKit includes a built copy of BeautifulSoup in Scripts/webkitpy

34 # so this import should always succeed.

35 from .BeautifulSoup import BeautifulSoup

36

37 import urllib2

parser.py (https://github.com/vishnevskiy/bbcodepy.git) Python · 112 lines

107

108 if prettify:

109 from BeautifulSoup import BeautifulSoup

110 html = BeautifulSoup(html).prettify()

DNSDumpsterAPI.py (https://github.com/m0rtem/CloudFail.git) Python · 84 lines

10 import requests

11

12 from bs4 import BeautifulSoup

13

14

53

54 req = s.get(dnsdumpster_url)

55 soup = BeautifulSoup(req.content, 'html.parser')

56 csrf_middleware = soup.findAll('input', attrs={'name': 'csrfmiddlewaretoken'})[0]['value']

57 self.display_message('Retrieved token: %s' % csrf_middleware)

74 return []

75

76 soup = BeautifulSoup(req.content, 'html.parser')

77 tables = soup.findAll('table')

78

create-toolboxes.sh (https://github.com/bogdan2412/dotfiles.git) Shell · 51 lines

41

42 toolbox create $CREATE_ARGS -c python || true

43 toolbox run -c python sudo dnf install -y python3-beautifulsoup4 python3-html5lib python3-netifaces python3-pycodestyle

44 toolbox run -c python sudo dnf autoremove -y

45 toolbox run -c python sudo dnf clean all

VALLA.py (https://bitbucket.org/zhangjiejun/sjtuonlinejudge.git) Python · 75 lines

4 import urlparse

5 import urllib2

6 from bs4 import BeautifulSoup

7 from django.utils.encoding import smart_unicode

8

47 html = re.sub(r,'<br/>',html)

48

49 soup = BeautifulSoup(html)

50

51 link_tags = soup.find_all('link')

tempdir.patch (https://github.com/1000timesdead/portage.git) Patch · 71 lines

7 +import tempfile

8 +import shutil

9 # from BeautifulSoup import BeautifulSoup

10

11 global version

bills.py (https://github.com/jsoma/openstates.git) Python · 114 lines

2 import urllib

3 import re

4 from BeautifulSoup import BeautifulSoup

5

6 from fiftystates.scrape import NoDataForPeriod

44 #request page with list of all bills in year

45 with self.urlopen(search_url + '?' + params) as doc:

46 soup = BeautifulSoup(doc)

47

48 #parse results

bpython-settings.py (https://bitbucket.org/alexanderbohn/tessar.git) Python · 80 lines

57

58 ## web scraping stuff

59 from bs4 import BeautifulSoup

60

61 ## misc stuff

soupparser.py (https://github.com/jcrobak/hue.git) Python · 122 lines

7 from BeautifulSoup import \

8 BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString

9

10

11 def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):

12 """Parse a string of HTML data into an Element tree using the

13 BeautifulSoup parser.

25 def parse(file, beautifulsoup=None, makeelement=None, **bsargs):

26 """Parse a file into an ElemenTree using the BeautifulSoup parser.

27

28 You can pass a different BeautifulSoup parser through the

60 if beautifulsoup is None:

61 beautifulsoup = BeautifulSoup

62 if makeelement is None:

63 makeelement = html.html_parser.makeelement
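
Upstream this module is exposed as lxml.html.soupparser; a minimal usage sketch, assuming lxml and a BeautifulSoup package are installed:

    from lxml.html import soupparser

    root = soupparser.fromstring("<p>unclosed <b>markup")   # lenient parse via BeautifulSoup
    print(root.tag)                                          # an lxml Element, rooted at <html>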

defines.py (https://github.com/seppius-xbmc-repo/ru.git) Python · 124 lines

7 import threading

8 import os

9 from BeautifulSoup import BeautifulSoup

10

11 ADDON = xbmcaddon.Addon(id='script.torrent-tv.ru.pp')

111 def checkPort(params):

112 data = GET("http://2ip.ru/check-port/?port=%s" % params)

113 beautifulSoup = BeautifulSoup(data)

114 port = beautifulSoup.find('div', attrs={'class': 'ip-entry'}).text

pingback.py (https://github.com/emilian/django-blog-zinnia.git) Python · 140 lines

15 from zinnia.models import Entry

16 from zinnia.settings import PINGBACK_CONTENT_LENGTH

17 from BeautifulSoup import BeautifulSoup

18 from django_xmlrpc.decorators import xmlrpc_func

19

92 return TARGET_IS_NOT_PINGABLE

93

94 soup = BeautifulSoup(document)

95 title = soup.find('title')

96 title = title and strip_tags(title) or _('No title')

scrape.py (https://github.com/pcsforeducation/diveintopython.git) Python · 127 lines

1 from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup, Comment

2 import urllib

3 import os

13 try:

14 p = open('dip.html', 'r')

15 soup = BeautifulSoup(p.read())

16 except IOError, e:

17 print "io error code: %d msg: %s" % (e.returncode, e.message)

34 with open(filename, 'r') as f:

35

36 soup = BeautifulSoup(f)

37 print "working on %s" % (filename, )

38 for div in soup.findAll('div'):

95 soup.head.insert(len(soup.head.contents), code)

96

97 new_soup = BeautifulSoup(soup.renderContents())

98 for i in new_soup.findAll('a'):

99 if i.has_key('href'):

test_functional.py (https://github.com/encukou/deform.git) Python · 255 lines

34

35 def _soupify(self, html):

36 from BeautifulSoup import BeautifulSoup

37 return BeautifulSoup(html)

__init__.py (https://github.com/235/django-template-introspection.git) Python · 91 lines

2 from django.conf import settings

3 from django.template import Template, StringOrigin

4 from BeautifulSoup import BeautifulSoup, Tag

5 from hashlib import md5

6 import inspect

66

67 #add an attribute to each HTML-tag with a given hash or update existing

68 #WARNING: if the produced HTML is invalid, BeautifulSoup will try to fix it

69 soup = BeautifulSoup(output)