100+ results for 'beautifulsoup'
examples.py (https://github.com/towerjoo/django-test-extensions.git) Python · 112 lines
feed.py (https://bitbucket.org/milos07p/pypsd-nao-on-git.git) Python · 194 lines
forms.py (https://github.com/stamen/fieldpapers.git) Python · 142 lines
thingSpider.py (https://github.com/enjrolas/Makerbot-Vending-Machine.git) Python · 138 lines
get_manmankan_images.py (https://github.com/mitnk/stuff.git) Python · 143 lines
11 import time
13 from BeautifulSoup import BeautifulSoup
15 def save_image_list_to_cache(dir_name, image_list):
32 def get_chapter_list(url):
33 page = urllib2.urlopen(url)
34 soup = BeautifulSoup(page, fromEncoding="gb18030")
35 print u"Reading information of %s ..." % soup.findAll("h1")[0].string
57 def get_image_list(url):
58 page = urllib2.urlopen(url)
59 soup = BeautifulSoup(page)
60 javascripts = soup.findAll(text=lambda text: text.parent.name == "script")
61 image_script = ""
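The excerpt above uses the old BeautifulSoup 3 API, where the page encoding is passed as fromEncoding. A minimal bs4 equivalent of the same encoding-aware parse, assuming a hypothetical chapter-list URL:

    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    # hypothetical URL; BS3's fromEncoding= became from_encoding= in bs4
    page = urlopen("http://example.com/chapter-list")
    soup = BeautifulSoup(page, "html.parser", from_encoding="gb18030")
    print("Reading information of %s ..." % soup.find_all("h1")[0].string)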
Makefile (https://gitlab.com/lokiexinferis/vim-configs) Makefile · 82 lines
response.py (https://github.com/yoyo2k/scrapy.git) Python · 84 lines
Makefile (https://github.com/freebsd/freebsd-ports.git) Makefile · 90 lines
48 BTLNCK_DESC= Accelerate certain NaN evals via math/py-bottleneck
49 EXCEL_DESC= MS Excel I/O Add-ons
50 HTML5LIB_DESC= Parse HTML with www/py-html5lib and www/py-beautifulsoup
51 HTML_DESC= HTML Parsing/Generation Add-ons
52 JINJA2_DESC= Support conditional HTML formatting with devel/py-Jinja2
71 BOTO_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}boto>0:devel/py-boto@${PY_FLAVOR}
72 BTLNCK_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}bottleneck>=1.2.0:math/py-bottleneck@${PY_FLAVOR}
73 HTML5LIB_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}beautifulsoup>=4.2.1:www/py-beautifulsoup@${PY_FLAVOR} \
74 ${PYTHON_PKGNAMEPREFIX}html5lib>0:www/py-html5lib@${PY_FLAVOR}
75 JINJA2_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}Jinja2>0:devel/py-Jinja2@${PY_FLAVOR}
76 LXML_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}beautifulsoup>=4.2.1:www/py-beautifulsoup@${PY_FLAVOR} \
77 ${PYTHON_PKGNAMEPREFIX}lxml>0:devel/py-lxml@${PY_FLAVOR}
78 MPL_RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}matplotlib>=2.0.0:math/py-matplotlib@${PY_FLAVOR}
ParseHtmlfromFile.py (https://github.com/PuercoPop/EleccionesPeru.git) Python · 85 lines
2 # -*- coding: utf-8
4 from BeautifulSoup import BeautifulSoup
5 import pdb
15 """
17 soup = BeautifulSoup( f_handle )
18 a = soup.findAll('tr',height="40")
22 T_Flag = False
23 for item in soup.findAll('tr'):
24 for item2 in BeautifulSoup(str(item)).findAll('span',{'class':'arial_contenido_negrita'}):
25 if T_Flag == True:
26 if item2.contents == []:
78 pass
79 #print item.contents
80 #b = BeautifulSoup.BeautifulSoup(str(a))
81 #c = BeautifulSoup.BeautifulSoup( str( b.find('td',align="left" ) ) )
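The re-parsing above (wrapping str(item) in a fresh BeautifulSoup) is unnecessary, since a tag can be searched directly. A sketch of the same attribute-based lookup against bs4, using toy markup:

    from bs4 import BeautifulSoup

    html = '<table><tr height="40"><td><span class="arial_contenido_negrita">v</span></td></tr></table>'
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find_all("tr", height="40"):  # keyword-style attribute match
        # dict-style attribute match, run on the row itself rather than a re-parse
        for span in row.find_all("span", {"class": "arial_contenido_negrita"}):
            print(span.get_text())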
settings.py (https://github.com/thenoviceoof/rooibos.git) Python · 64 lines
listing.py (https://github.com/4teamwork/ftw.pdfgenerator.git) Python · 178 lines
HelpIndex.py (https://github.com/esitarski/CrossMgr.git) Python · 98 lines
8 import glob
9 import re
10 from bs4 import BeautifulSoup
12 htmlDocDir = 'CrossMgrHtmlDoc'
44 # Extract content sections from the html pages.
45 for f in glob.iglob( os.path.join(htmlDocDir, '*.html') ):
46 doc = BeautifulSoup( open(f).read(), 'html.parser' )
47 div = doc.find('div', class_='content')
48 if not div:
PYopLib.py (https://bitbucket.org/y0no/pyopmail.git) Python · 78 lines
testFunctional.py (https://github.com/eaudeweb/Naaya.git) Python · 151 lines
1 import re
2 from BeautifulSoup import BeautifulSoup
4 from Products.Naaya.tests.NaayaFunctionalTestCase import NaayaFunctionalTestCase
140 self.browser.go('http://localhost/portal/myfolder')
141 html = self.browser.get_html()
142 soup = BeautifulSoup(html)
144 tables = soup.findAll('table', id='folderfile_list')
legacy.py (https://github.com/jlongman/xbmc-hockeystreams-plugin.git) Python · 174 lines
ieo.py (https://gitlab.com/rithvikvibhu/batch-sof) Python · 71 lines
2 import json
3 import pprint
4 from bs4 import BeautifulSoup
6 print "batch-sof: IEO\nAuthor: Rithvik\n-----------------\n"
46 r = requests.post("http://server1.sofworld.org/ieo-result/show.php", data = payload)
48 soup = BeautifulSoup(r.text, "html5lib") # Soup up html
49 table_data = [[cell.text for cell in row("td")]
50 for row in BeautifulSoup(r.text, "html5lib")("tr")]
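Note that the excerpt parses r.text twice, once for soup and again inside the comprehension. A sketch that builds the table from a single parse (the POST payload is omitted here):

    import requests
    from bs4 import BeautifulSoup

    r = requests.post("http://server1.sofworld.org/ieo-result/show.php", data={})  # payload omitted
    soup = BeautifulSoup(r.text, "html5lib")  # requires the html5lib package
    # soup("tr") is shorthand for soup.find_all("tr"); reuse the soup instead of re-parsing
    table_data = [[cell.get_text(strip=True) for cell in row("td")]
                  for row in soup("tr")]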
imo.py (https://gitlab.com/rithvikvibhu/batch-sof) Python · 71 lines
2 import json
3 import pprint
4 from bs4 import BeautifulSoup
6 print "batch-sof: IMO\nAuthor: Rithvik\n-----------------\n"
46 r = requests.post("http://server1.sofworld.org/imo-result/show.php", data = payload)
48 soup = BeautifulSoup(r.text, "html5lib") # Soup up html
49 table_data = [[cell.text for cell in row("td")]
50 for row in BeautifulSoup(r.text, "html5lib")("tr")]
run.py (https://bitbucket.org/skywalking/loginparttimesystem.git) Python · 118 lines
4 import cookielib, optparse, setting, urllib, urllib2, sys
5 from BeautifulSoup import BeautifulSoup
6 from datetime import datetime
7 from time import sleep
27 def parse_signin(content, project = 1):
28 info = ()
29 bs = BeautifulSoup(content).findAll('tr')[project]
30 v = bs.findAll('td')
31 k = bs.find('input', {'name': 'signin'})
35 def parse_signout(content):
36 bs = BeautifulSoup(content).find('div', {'id': 'body'})
37 if bs.text == '您沒有簽到記錄,無法進行簽退 ....':  # "You have no sign-in record, so you cannot sign out ...."
38 return 0
wiki_semantic.py (https://github.com/japerk/geopy.git) Python · 108 lines
7 try:
8 from BeautifulSoup import BeautifulSoup
9 except ImportError:
10 util.logger.warn("BeautifulSoup was not found. " \
30 def parse_rdf_link(self, page, mime_type='application/rdf+xml'):
31 """Parse the URL of the RDF link from the <head> of ``page``."""
32 soup = BeautifulSoup(page)
33 link = soup.head.find('link', rel='alternate', type=mime_type)
34 return link and link['href'] or None
80 util.logger.debug("Fetching %s..." % url)
81 page = urlopen(url)
82 soup = BeautifulSoup(page)
84 rdf_url = self.parse_rdf_link(soup)
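The guarded import above keeps the module usable when BeautifulSoup is absent. A compact bs4-era sketch of the same pattern together with the rel="alternate" link lookup:

    try:
        from bs4 import BeautifulSoup
    except ImportError:
        BeautifulSoup = None  # degrade gracefully, as the excerpt does

    def parse_rdf_link(page, mime_type="application/rdf+xml"):
        """Return the href of the alternate RDF <link> in <head>, or None."""
        if BeautifulSoup is None:
            return None
        soup = BeautifulSoup(page, "html.parser")
        head = soup.head
        link = head.find("link", rel="alternate", type=mime_type) if head else None
        return link["href"] if link else None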
toc.py (https://gitlab.com/janninematt/janninematt) Python · 145 lines
11 import re
13 from bs4 import BeautifulSoup, Comment
15 from pelican import contents, signals
118 title = content.metadata.get('title', 'Title')
119 tree = node = HtmlTreeNode(None, title, 'h0', '')
120 soup = BeautifulSoup(content._content, 'html.parser')
121 settoc = False
136 if (settoc):
137 tree_string = '{}'.format(tree)
138 tree_soup = BeautifulSoup(tree_string, 'html.parser')
139 content.toc = tree_soup.decode(formatter='html')
140 content._content = soup.decode(formatter='html')
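The TOC builder above walks heading tags into an HtmlTreeNode tree. Stripped of the Pelican plumbing, the core heading scan reduces to something like:

    from bs4 import BeautifulSoup

    def extract_headings(html):
        """Collect (level, text) pairs, the raw material for a table of contents."""
        soup = BeautifulSoup(html, "html.parser")
        return [(int(tag.name[1]), tag.get_text(strip=True))
                for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])]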
main.py (https://gitlab.com/smidaharoun/devoirTunisiePython) Python · 197 lines
2 import urllib2
4 from bs4 import BeautifulSoup
5 from flask import Flask, jsonify
6 from flask import request
12 main = "https://www.devoir.tn/"
13 page = urllib2.urlopen(main)
14 soup = BeautifulSoup(page, 'html.parser')
15 soup.prettify()
67 url_level = main + request_link
68 page_level = urllib2.urlopen(url_level)
69 soup_level = BeautifulSoup(page_level, 'html.parser')
70 soup_level.prettify()
71 table = soup_level.find_all("div", {'class': re.compile(r'card br-1 bgb-amber.*')})
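The find_all call above shows that attribute matchers accept compiled regular expressions, not just strings. A self-contained sketch of the same idea on toy markup:

    import re
    from bs4 import BeautifulSoup

    html = '<div class="card br-1 bgb-amber-500">a</div><div class="other">b</div>'
    soup = BeautifulSoup(html, "html.parser")
    # the regex is tried against the space-joined class string as well as each class
    cards = soup.find_all("div", {"class": re.compile(r"card br-1 bgb-amber.*")})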
requirements_txt_linker_spec.rb (https://gitlab.com/wendy0402/gitlab-ce) Ruby · 95 lines
27 nose
28 nose-cov
29 beautifulsoup4
30 #
31 ###### Requirements with Version Specifiers ######
69 expect(subject).to include(link('nose', 'https://pypi.python.org/pypi/nose'))
70 expect(subject).to include(link('nose-cov', 'https://pypi.python.org/pypi/nose-cov'))
71 expect(subject).to include(link('beautifulsoup4', 'https://pypi.python.org/pypi/beautifulsoup4'))
72 expect(subject).to include(link('docopt', 'https://pypi.python.org/pypi/docopt'))
73 expect(subject).to include(link('keyring', 'https://pypi.python.org/pypi/keyring'))
index.rst (https://bitbucket.org/edhaker13/flexget.git) ReStructuredText · 128 lines
main_redis.py (https://github.com/jackfrued/Python-100-Days.git) Python · 156 lines
setup.py (https://github.com/AnneGilles/deform.git) Python · 78 lines
crawl.py (https://github.com/tarunrs/osu-events-server.git) Python · 70 lines
3 import time
4 from datetime import date, timedelta, datetime
5 from BeautifulSoup import BeautifulSoup, NavigableString, Tag
6 from Events import OSUEvents, Categories, Locations, Event_Types
7 from sqlalchemy.orm import sessionmaker
40 page_url = 'http://www.osu.edu/events/indexDay.php?Event_ID=&Date=' + str_date
41 html_doc = urllib2.urlopen(page_url).read()
42 soup = BeautifulSoup(html_doc)
43 events = soup.table.contents[3].td.findAll("p")
44 for e in events:
getAWSdocs.py (https://github.com/richarvey/getAWSdocs.git) Python · 158 lines
1 #!/usr/bin/env python3
3 from bs4 import BeautifulSoup
4 import os, argparse
5 from urllib.parse import urlparse, urlsplit
19 html_page = urlopen(start_page)
20 # Parse the HTML page
21 soup = BeautifulSoup(html_page, 'html.parser')
22 pdfs = set()
23 print("Generating PDF list (this may take some time)")
38 def find_pdfs_in_html(url):
39 html_page_doc = urlopen(url)
40 soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
41 # Get the A tag from the parsed page
42 pdfs = set()
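The same link-harvesting loop, condensed: href=True restricts find_all to anchors that actually carry an href, and a set deduplicates the results.

    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    def find_pdfs_in_html(url):
        """Return the set of .pdf hrefs found on one page."""
        soup = BeautifulSoup(urlopen(url), "html.parser")
        return {a["href"] for a in soup.find_all("a", href=True)
                if a["href"].lower().endswith(".pdf")}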
helper.py (https://bitbucket.org/macdylan/lbforum.git) Python · 44 lines
README.rst (https://github.com/liberation/django_compressor.git) ReStructuredText · 71 lines
35 Django Compressor is highly configurable and extendible. The HTML parsing
36 is done using lxml_ or if it's not available Python's built-in HTMLParser by
37 default. As an alternative Django Compressor provides a BeautifulSoup_ and a
38 html5lib_ based parser, as well as an abstract base class that makes it easy to
39 write a custom parser.
57 ``pip install django_compressor==dev`` or ``easy_install django_compressor==dev``.
59 .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
60 .. _lxml: http://lxml.de/
61 .. _html5lib: http://code.google.com/p/html5lib/
BeautifulSupe.py (https://github.com/mbologna/BitFountain.git) Python · 132 lines
independent.py (https://gitlab.com/harrigan/TPP) Python · 36 lines
single.py (https://gitlab.com/skororu/pysnippets) Python · 67 lines
7 import random # randint
9 import bs4 # BeautifulSoup
10 import requests # codes.ok, get
28 # access the permanent URL for the selected cartoon
29 req = requests.get(url)
30 page = bs4.BeautifulSoup(req.text, 'lxml')
32 # obtain cartoon strip title from the ctitle block
44 # so our random selection has an upper bound
45 with requests.get(base_url) as req:
46 page = bs4.BeautifulSoup(req.text, 'lxml')
47 num_previous = page.find('a', rel='prev')['href']
48 upper_limit = int(num_previous[1:-1]) + 1
parseDelscrHTM-1.py (https://github.com/gtani7/pyrb--python-scrape-spider.git) Python · 61 lines
9 # url_re=re.compile(r'<a href="(.*)" onmousedown=')
10 def __init__(self,mag_file): # mag_file is string, not file obj
11 from BeautifulSoup import BeautifulSoup
12 soup = BeautifulSoup(mag_file)
26 def __init__(self,red_file):
27 from BeautifulSoup import BeautifulSoup
28 soup = BeautifulSoup(red_file)
geoserver.py (https://github.com/dotskapes/dotSkapes.git) Python · 104 lines
version_check.py (https://gitlab.com/mimizone/kolla) Python · 126 lines
hupu_redis.py (https://github.com/rieuse/learnPython.git) Python · 67 lines
voc_det_generator.py (https://github.com/donnyyou/torchcv.git) Python · 138 lines
catalogparser.py (https://github.com/jeffh/YACS.git) Python · 106 lines
1 import urllib2
2 import re
3 from BeautifulSoup import BeautifulSoup
4 from rpi_courses.config import DEPARTMENTS
39 def get_course_detail(course_page):
40 course_page = re.sub('<br */?>', '\n', course_page)
41 soup = BeautifulSoup(course_page, convertEntities=BeautifulSoup.HTML_ENTITIES)
42 title_text = soup.findAll('h1 h2 h3 h4 h5 h6'.split(' '))[0].text
43 title = re.search('([\w+\s]+) (\d+\w+) \- (.*)', title_text)
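convertEntities= exists only in BeautifulSoup 3; bs4 resolves entities unconditionally. The heading lookup, which passes a list of tag names, ports over directly:

    import re
    from bs4 import BeautifulSoup

    course_page = "<h1>Calculus &amp; Analysis - 4010</h1><br/>"
    course_page = re.sub("<br */?>", "\n", course_page)
    soup = BeautifulSoup(course_page, "html.parser")  # entities decoded automatically
    title_text = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])[0].text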
scrape.py (https://github.com/sneeu/aliss_scrapers.git) Python · 113 lines
zad_7.py (https://gitlab.com/mmeisel/LV) Python · 43 lines
7 import re
8 import urllib
9 from bs4 import BeautifulSoup
11 def ripDomain(): # function that "rips" the domain out of the link
25 urlAddr=correctURL() # the URL is corrected if necessary
26 html=urllib.urlopen(urlAddr).read() # the URL is opened ("lxml" is a parser name and belongs in the BeautifulSoup call, not urlopen)
27 soup=BeautifulSoup(html, "lxml") # and a BeautifulSoup object is built from it
29 domain=ripDomain() # the domain is extracted
scraping-the-web.rst (https://github.com/EnTeQuAk/pydanny-event-notes.git) ReStructuredText · 109 lines
test-broken-html.py (https://github.com/JonathanRRogers/twill.git) Python · 181 lines
25 commands.config('use_tidy', '0')
26 commands.config('use_BeautifulSoup', '0')
27 commands.config('allow_parse_errors', '0')
61 commands.config('use_tidy', '1')
62 commands.config('use_BeautifulSoup', '0')
63 commands.config('allow_parse_errors', '0')
89 # pass
91 def test_BeautifulSoup():
92 """
93 test parsing of BS-processed HTML.
conversation.py (https://gitlab.com/sanchezfauste/TweetDigraph) Python · 108 lines
burp-to-sqlmap.py (https://github.com/Miladkhoshdel/burp-to-sqlmap.git) Python · 191 lines
2 import sys
3 import os
4 from bs4 import BeautifulSoup
5 import os.path
6 import argparse
103 print(" [+] Exporting Packets ...")
104 with open(filename, 'r') as f:
105 soup = BeautifulSoup(f.read(), "html.parser")
106 for i in soup.find_all("request"):
107 packetnumber = packetnumber + 1
144 with open(filename, 'r') as f:
145 soup = BeautifulSoup(f.read(), "html.parser")
146 for i in soup.find_all("request"):
147 packetnumber = packetnumber + 1
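Burp's export is XML, but html.parser treats the lowercased <request> elements as ordinary tags, which is what the loop above relies on. A minimal sketch with a hypothetical filename:

    from bs4 import BeautifulSoup

    with open("burp-export.xml") as f:  # hypothetical filename
        soup = BeautifulSoup(f.read(), "html.parser")

    for packetnumber, request in enumerate(soup.find_all("request"), start=1):
        print(packetnumber, request.get_text()[:60])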
bills.py (https://github.com/runderwood/openstates.git) Python · 138 lines
4 import datetime as dt
5 import urllib2
6 from BeautifulSoup import BeautifulSoup
8 from fiftystates.scrape.bills import BillScraper, Bill
11 '''Remove some irregularities from WV's HTML.
13 It includes a spurious </HEAD> before the useful data begins and lines like '<option value="Bill"selected="selected">Bill</option>', in which the lack of a space between the attributes confuses BeautifulSoup.
14 '''
15 data = data.replace('</HEAD>', '')
68 if not sessionexisted(data):
69 return False
70 soup = BeautifulSoup(cleansource(data))
71 rows = soup.findAll('table')[1].findAll('tr')[1:]
72 for row in rows:
testFunctional.py (https://github.com/eaudeweb/Naaya.git) Python · 162 lines
1 import re
2 from unittest import TestSuite, makeSuite
3 from BeautifulSoup import BeautifulSoup
5 from Products.Naaya.tests.NaayaFunctionalTestCase import NaayaFunctionalTestCase
146 self.browser.go('http://localhost/portal/myfolder')
147 html = self.browser.get_html()
148 soup = BeautifulSoup(html)
150 tables = soup.findAll('table', id='folderfile_list')
hdtrailers.py (https://github.com/ryanrdetzel/CouchPotato.git) Python · 128 lines
1 from app.config.cplog import CPLog
2 from app.lib.provider.rss import rss
3 from imdb.parser.http.bsouplxml._bsoup import SoupStrainer, BeautifulSoup
4 from string import letters, digits
5 from urllib import urlencode
68 try:
69 tables = SoupStrainer('div')
70 html = BeautifulSoup(data, parseOnlyThese = tables)
71 resultTable = html.findAll('h2', text = re.compile(movie))
92 try:
93 tables = SoupStrainer('table')
94 html = BeautifulSoup(data, parseOnlyThese = tables)
95 resultTable = html.find('table', attrs = {'class':'bottomTable'})
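SoupStrainer survives in bs4, where parseOnlyThese= was renamed parse_only=; only the matching subtrees are built, which keeps large pages cheap. A sketch on toy markup:

    from bs4 import BeautifulSoup, SoupStrainer

    data = '<div>skipped</div><table class="bottomTable"><tr><td>x</td></tr></table>'
    only_tables = SoupStrainer("table")  # everything outside <table> is discarded
    html = BeautifulSoup(data, "html.parser", parse_only=only_tables)
    result_table = html.find("table", attrs={"class": "bottomTable"})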
pingback.py (https://github.com/sumegha/django-gstudio.git) Python · 140 lines
15 from gstudio.models import Nodetype
16 from gstudio.settings import PINGBACK_CONTENT_LENGTH
17 from BeautifulSoup import BeautifulSoup
18 from django_xmlrpc.decorators import xmlrpc_func
92 return TARGET_IS_NOT_PINGABLE
94 soup = BeautifulSoup(document)
95 title = soup.find('title')
96 title = title and strip_tags(title) or _('No title')
__init__.py (https://github.com/palli81/headphones.git) Python · 96 lines
40 treeType - the name of the tree type required (case-insensitive). Supported
41 values are "simpletree", "dom", "etree" and "beautifulsoup"
43 "simpletree" - a built-in DOM-ish tree type with support for some
50 elementtree-like interface (known to work with
51 ElementTree, cElementTree and lxml.etree).
52 "beautifulsoup" - Beautiful soup (if installed)
54 implementation - (Currently applies to the "etree" and "dom" tree types). A
69 import simpletree
70 treeBuilderCache[treeType] = simpletree.TreeBuilder
71 elif treeType == "beautifulsoup":
72 import soup
73 treeBuilderCache[treeType] = soup.TreeBuilder
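This excerpt is from an old vendored html5lib; its "beautifulsoup" tree builder was dropped in later releases, but current html5lib still selects tree types the same way with the surviving builders:

    import html5lib

    # "etree" and "dom" remain supported treebuilder names
    doc = html5lib.parse("<p>hi</p>", treebuilder="etree")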
shorter.py (https://bitbucket.org/badc0re/xsser_gsoc.git) Python · 76 lines
vt_hash2filenames.py (https://bitbucket.org/Vnoxygen/malformity.git) Python · 43 lines
bootstrap.py (https://github.com/Huawei/containerops.git) Python · 185 lines
utils.py (https://github.com/Gautier/django-page-cms.git) Python · 139 lines
release.py (https://gitlab.com/LocutusOfPenguin/python-chess) Python · 178 lines
pyurllib.py (https://github.com/TsingJyujing/DataSpider.git) Python · 196 lines
transforms.py (https://github.com/giacomos/jarn.xmpp.core.git) Python · 94 lines
types.py (https://github.com/rxuriguera/bibtexIndexMaker.git) Python · 87 lines
importer.py (https://github.com/gregmalcolm/Bookie.git) Python · 201 lines
1 """Importers for bookmarks"""
2 from datetime import datetime
3 from BeautifulSoup import BeautifulSoup
4 from bookie.models import BmarkMgr
77 delicious_doctype = "DOCTYPE NETSCAPE-Bookmark-file-1"
79 soup = BeautifulSoup(file_io)
80 can_handle = False
81 can_handle = DelImporter._is_delicious_format(soup,
89 def process(self, fulltext=None):
90 """Given a file, process it"""
91 soup = BeautifulSoup(self.file_handle)
93 for tag in soup.findAll('dt'):
test_microformats.py (https://bitbucket.org/inirudebwoy/gdziebylkaziu.git) Python · 155 lines
4 from geopy.parsers.html import GeoMicroformat
5 try:
6 from BeautifulSoup import BeautifulSoup
7 except ImportError:
8 BeautifulSoup = None
25 def test_one_soup(self):
26 if BeautifulSoup:
27 locations = self.parser.find_all(BeautifulSoup(self.MARKUP))
31 def test_multi_soup(self):
32 if BeautifulSoup:
33 locations = self.parser.find_all(BeautifulSoup(self.MARKUP * 3))
50 def test_none_soup(self):
51 if BeautifulSoup:
52 locations = self.parser.find_all(BeautifulSoup(self.MARKUP))
share_post.py (https://gitlab.com/janninematt/janninematt) Python · 81 lines
7 """
9 from bs4 import BeautifulSoup
10 try:
11 from urllib.parse import quote
18 def article_title(content):
19 main_title = BeautifulSoup(content.title, 'html.parser').get_text().strip()
20 sub_title = ''
21 if hasattr(content, 'subtitle'):
22 sub_title = ' ' + BeautifulSoup(content.subtitle, 'html.parser').get_text().strip()
23 return quote(('%s%s' % (main_title, sub_title)).encode('utf-8'))
layouttestresults.py (https://gitlab.com/x33n/phantomjs) Python · 91 lines
setup.py (https://github.com/eged/django-blog-zinnia.git) Python · 37 lines
test_html.py (https://github.com/openhatch/oh-mainline.git) Python · 151 lines
26 list(br.links())
28 def test_robust_form_parser_uses_beautifulsoup(self):
29 factory = mechanize.RobustFormsFactory()
30 self.assertIs(factory.form_parser_class,
31 mechanize._form.RobustFormParser)
33 def test_form_parser_does_not_use_beautifulsoup(self):
34 factory = mechanize.FormsFactory()
35 self.assertIs(factory.form_parser_class, mechanize._form.FormParser)
baidu_spider_progress.py (https://github.com/kkyon/botflow.git) Python · 123 lines
1 from botflow import Pipe, Branch, Timer
2 from botflow import BotFlow
3 from bs4 import BeautifulSoup
4 from dataclasses import dataclass
5 from botflow.ex.http import HttpLoader
27 # parse the individual result entries
28 def get_all_items(response):
29 soup = BeautifulSoup(response.text, "lxml")
30 items = soup.select('div.result.c-container')
31 result = []
46 #BD_URL='https://180.97.33.108' #
47 BD_URL='https://www.baidu.com'
48 soup = BeautifulSoup(response.text, "lxml")
49 page = soup.select('div#page')
50 for item in page[0].find_all('a'):
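soup.select takes CSS selectors, so the class conjunction and the descendant lookup above read exactly as they would in a stylesheet. A self-contained sketch:

    from bs4 import BeautifulSoup

    html = '<div class="result c-container"><a href="#">hit</a></div><div id="page"><a>2</a></div>'
    soup = BeautifulSoup(html, "html.parser")      # "lxml" also works if installed
    items = soup.select("div.result.c-container")  # both classes must be present
    links = soup.select("div#page a")              # descendant selector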
Python爬取日记八:斗鱼弹幕相关信息保存到mongodb.py (https://github.com/rieuse/learnPython.git) Python · 94 lines
feats.py (https://github.com/devonjones/PSRD-Parser.git) Python · 153 lines
2 import json
3 import re
4 from BeautifulSoup import BeautifulSoup
5 from psrd.rules import write_rules
6 from psrd.files import char_replace
85 p = find_section(feat, name="Prerequisites", section_type='section')
86 if p != None:
87 soup = BeautifulSoup(p['text'])
88 p['description'] = ''.join(soup.findAll(text=True))
89 del p['text']
117 feat['sections'].remove(section)
118 if feat.has_key('text') and not feat.has_key('description'):
119 soup = BeautifulSoup(feat['text'])
120 feat['description'] = ''.join(soup.findAll(text=True))
121 del feat['text']
plugin.py (https://github.com/gsf/supybot-plugins.git) Python · 47 lines
download_russian_contrast.py (https://bitbucket.org/Meister17/term-extraction.git) Python · 108 lines
refreshportlet.py (https://github.com/plone/plone.app.kss.git) Python · 64 lines
1 from zope.deprecation import deprecate
3 from kss.core.BeautifulSoup import BeautifulSoup
4 from kss.core import CommandSet
5 from plone.app.portlets.utils import assignment_from_key
26 # So we just select the <dl> for insertion.
27 # This could be spared with smarter templating.
28 soup = BeautifulSoup(portlet_body)
29 tag = soup.find('dl', id=nodeid)
30 result = unicode(tag)
middleware.py (https://github.com/ralphbean/raptorizemw.git) Python · 151 lines
2 import BeautifulSoup
3 import datetime
4 import random
102 """
104 soup = BeautifulSoup.BeautifulSoup(resp.body)
106 if not soup.html:
109 if not soup.html.head:
110 soup.html.insert(0, BeautifulSoup.Tag(soup, "head"))
112 prefix = self.resources_app.prefix
113 js_helper = BeautifulSoup.Tag(
114 soup, "script", attrs=[
115 ('type', 'text/javascript'),
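BeautifulSoup.Tag(soup, name, attrs=[...]) is the BS3 way to build elements; bs4 replaced it with soup.new_tag(). A sketch of the same head-injection trick (the script path is hypothetical):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<html><body></body></html>", "html.parser")
    if soup.html.head is None:
        soup.html.insert(0, soup.new_tag("head"))
    # keyword arguments become tag attributes; the src path is hypothetical
    js = soup.new_tag("script", type="text/javascript", src="/static/raptorize.js")
    soup.html.head.append(js)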
utils.py (https://github.com/nielssprong/lernanta.git) Python · 116 lines
10 from BeautifulSoup import BeautifulSoup
12 from django.conf import settings
69 both.
70 """
71 soup = BeautifulSoup(content)
72 links = soup.findAll('link')
74 # BeautifulSoup instances are not actually dictionaries, so
75 # we can't use the more proper 'key in dict' syntax and
76 # must instead use the deprecated 'has_key()' method.
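The deprecated has_key() that the comment mentions is gone in bs4; tags still are not dictionaries, but they expose has_attr() and a dict-like .attrs:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<link rel="alternate" href="/feed">', "html.parser")
    for link in soup.find_all("link"):
        if link.has_attr("rel"):  # replaces the old link.has_key("rel")
            print(link["rel"], link.get("href"))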
Makefile (https://github.com/freebsd/freebsd-ports.git) Makefile · 34 lines
16 ${PYTHON_PKGNAMEPREFIX}keyring>0:security/py-keyring@${PY_FLAVOR} \
17 ${PYTHON_PKGNAMEPREFIX}psutil>=2.0:sysutils/py-psutil@${PY_FLAVOR}
18 RUN_DEPENDS= ${PYTHON_PKGNAMEPREFIX}beautifulsoup>=4.2.1:www/py-beautifulsoup@${PY_FLAVOR} \
19 ${PYTHON_PKGNAMEPREFIX}importlib-metadata>0:devel/py-importlib-metadata@${PY_FLAVOR} \
20 ${PYTHON_PKGNAMEPREFIX}keyring>0:security/py-keyring@${PY_FLAVOR} \
pingback.py (https://github.com/aparo/django-blog-zinnia.git) Python · 141 lines
16 from zinnia.managers import PINGBACK
17 from zinnia.settings import PINGBACK_CONTENT_LENGTH
18 from BeautifulSoup import BeautifulSoup
19 from django_xmlrpc.decorators import xmlrpc_func
93 return TARGET_IS_NOT_PINGABLE
95 soup = BeautifulSoup(document)
96 title = soup.find('title')
97 title = title and strip_tags(title) or _('No title')
shotchart_cbssports.py (https://github.com/kpascual/nbascrape.git) Python · 127 lines
search_opportunities.py (https://github.com/PacktPublishing/Python-Automation-Cookbook.git) Python · 150 lines
8 import delorean
9 import requests
10 from bs4 import BeautifulSoup
11 import mistune
12 import jinja2
61 # Get the article
62 response = requests.get(entry.link)
63 article = BeautifulSoup(response.text, 'html.parser')
64 article_reference = (article.title.string.strip(),
65 entry.summary.strip(),
CalendarBuilder.py (https://github.com/levyd/CalendarBuilder.git) Python · 70 lines
1 from sys import stderr
2 from BeautifulSoup import BeautifulSoup
3 from icalendar import Calendar, Event, vRecur
4 from datetime import datetime, timedelta
39 def parse(self, infile):
40 """Parses the dalonline HTML into a schedule for one week"""
41 doc = BeautifulSoup(infile.read())
43 # Get the base date (Monday) of the webpage's calendar
lx_simple.py (https://github.com/jabbalaci/jabbapylib.git) Python · 141 lines
53 </html>'''
54 doc = lx.to_doc(html)
55 print lx.prettify(doc, method=scraper.BEAUTIFULSOUP)
57 def demo4():
114 #doc = lx.to_doc(text, parser=scraper.HTML5PARSER)
115 #doc = lx.to_doc(text)
116 doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
117 #print type(doc)
118 #print etree.tostring(doc)
extraer_datos_composicion_alimentos.py (https://gitlab.com/FoodUpProject/FoodUp) Python · 54 lines
1 # -*- coding: utf-8 -*-
2 import urllib2,unicodedata
3 from bs4 import BeautifulSoup
5 # method for analysing a web address
6 def analisisDescarga(archivo,conexion):
7 html = conexion.read()
8 soup = BeautifulSoup(html)
9 # get a list of Strings filtered on the class attributes with values details and price
10 links = soup.find_all(True, {'align':['left','right']})
html.py (https://bitbucket.org/charlisim/search_url_web.git) Python · 101 lines
createspace-scraper.py (https://github.com/russx2/createspace-scraper.git) Python · 88 lines
2 import sys
3 import requests
4 from BeautifulSoup import BeautifulSoup
6 def get_sales(email, password, date_start, date_end):
53 r = session.post('https://www.createspace.com/pub/reports/ajax/table.salesdetails.do?sid=' + token + '&msk=mr')
55 markup = BeautifulSoup(r.content)
56 markupHeadingBlock = markup.find('tr', {'class': 'head2'})
57 totalQuantity = markupHeadingBlock.find(text = re.compile('\d+'))
get_legislation.py (https://github.com/gosuri/fiftystates.git) Python · 118 lines
porn4days.py (https://github.com/alfa-addon/addon.git) Python · 147 lines
discord_insult_spam_dm.py (https://github.com/Merubokkusu/discord-spam-bots.git) Python · 114 lines
18 import os
19 import random
20 from bs4 import BeautifulSoup
21 sys.path.append("./.")
22 from config import *
54 file.close()
55 html = urllib.request.urlopen("https://insult.mattbas.org/api/insult.html").read()
56 soup = BeautifulSoup(html,"html.parser")
57 insult_text = soup.find('h1')
58 print(insult_text.text)
81 file.close()
82 html = urllib.request.urlopen("https://insult.mattbas.org/api/insult.html").read()
83 soup = BeautifulSoup(html,"html.parser")
84 insult_text = soup.find('h1')
85 print(insult_text.text)
lxmlselector.py (https://github.com/steeve/scrapy-lxmlselector.git) Python · 154 lines
2 Lxml selector
3 Provides both XPath and CSS Selection.
4 Can use html5lib and BeautifulSoup.
6 Provided by Steeve Morin <steeve.morin@gmail.com>
26 def __init__(self, response=None, text=None, node=None, parent=None, expr=None,
27 use_html5lib=False, use_BeautifulSoup=False, namespaces=None):
28 if parent:
29 self.doc = parent.doc
31 elif response:
32 self.xmlNode = self._lxml_parse_document(response.body, use_html5lib,
33 use_BeautifulSoup)
34 self.doc = self.xmlNode.getroottree()
35 elif text:
mgstage.py (https://github.com/yoshiko2/AV_Data_Capture.git) Python · 129 lines
4 from lxml import etree
5 import json
6 from bs4 import BeautifulSoup
7 from ADC_function import *
8 # import sys
99 number=number2.upper()
100 htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
101 soup = BeautifulSoup(htmlcode, 'lxml')
102 a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
103 b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
browser.py (https://gitlab.com/phyks/weboob) Python · 120 lines
21 from weboob.deprecated.browser import Browser, BrowserIncorrectPassword
22 from weboob.deprecated.browser.parsers.iparser import IParser
23 import BeautifulSoup
25 from .pages import PagePrivateThreadsList, PagePrivateThread, PageLogin, PageIndex, DummyPage, PageUserProfile, PageCityList
31 class SoupParser(IParser):
32 def parse(self, data, encoding=None):
33 return BeautifulSoup.BeautifulSoup(data.read().decode(encoding or 'utf-8'), convertEntities=BeautifulSoup.BeautifulStoneSoup.ALL_ENTITIES)
pipelines.py (https://github.com/richshaw2015/oh-my-rss.git) Python · 142 lines
2009-2-6-my-macheist-release-estimate.markdown (https://github.com/FranklinChen/mattfoster.github.com.git) Markdown · 42 lines
14 <div class="thumbnail"><a href="http://skitch.com/mattfoster/bd2ut/macheist-mainframe"><img src="http://img.skitch.com/20090206-nd331ywttf11ypf684ehdr5scr.preview.jpg" alt="MacHeist: Mainframe" /></a><br /><span style="font-family: Lucida Grande, Trebuchet, sans-serif, Helvetica, Arial; font-size: 10px; color: #808080">Uploaded with <a href="http://plasq.com/">plasq</a>'s <a href="http://skitch.com">Skitch</a>!</span></div>
16 I used [BeautifulSoup](http://crummy.com/software/BeautifulSoup "Beautiful Soup: We called him Tortoise because he taught us.") with [urllib2](http://docs.python.org/library/urllib2.html "urllib2 — extensible library for opening URLs — Python v2.6.1 documentation") and a tiny regular expression, and here's the result:
18 <script src="http://gist.github.com/59370.js"></script>
congress.py (https://github.com/michaelmyers/python-congress.git) Python · 138 lines
11 from datetime import datetime
12 from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
13 from openpyxl.workbook import Workbook
14 from openpyxl.writer.excel import ExcelWriter
123 url = 'http://clerk.house.gov/evs/' + str(year) + '/index.asp'
124 page = urllib2.urlopen(url)
125 soup = BeautifulSoup(page)
126 text = soup.find('a')
ny_times_pre_1981_scraper.py (https://github.com/slifty/rdiscraper.git) Python · 101 lines
scrape.py (https://gitlab.com/mkhouri/news_scraper) Python · 72 lines
ece301.py (https://github.com/kuruoujou/Course-Note-Grabber.git) Python · 174 lines
4 os.putenv("DISPLAY",":0.0")
6 from BeautifulSoup import BeautifulSoup
7 import re
8 from urllib2 import urlopen
34 #Downloads past and upcoming exam info...
35 page = urlopen(home)
36 soup = BeautifulSoup(page)
37 links = soup.findAll(href=re.compile('.*?\.pdf'))
38 eicount = 0
52 #Downloads Course Docs
53 page = urlopen(docs)
54 soup = BeautifulSoup(page)
55 links = soup.findAll(href=re.compile('.*?\.pdf|.*?\.html?'))
56 dcount = 0
ci.py (https://github.com/pwxcoo/chinese-xinhua.git) Python · 72 lines
9 import requests, csv
10 from bs4 import BeautifulSoup
11 import time
12 from multiprocessing.dummy import Pool as ThreadPool
26 print(f'{url} is parsing')
27 html = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")
28 a = html.find_all('a', target="_blank")
36 try:
37 response = requests.get(words[i])
38 wordhtml = BeautifulSoup(response.content.decode('gbk', errors='ignore').replace('<br/>', '\n').replace('<br>', '\n')\
39 , "lxml")
40 td = wordhtml.find_all('table')[5].find_all('td')
PostNewsController.java (https://github.com/fuyunwang/ChengFeng1.5.git) Java · 142 lines
1 package com.beautifulsoup.chengfeng.controller.community;
3 import com.beautifulsoup.chengfeng.common.ResponseResult;
4 import com.beautifulsoup.chengfeng.controller.vo.PostNewsDetailVo;
5 import com.beautifulsoup.chengfeng.controller.vo.PostNewsVo;
6 import com.beautifulsoup.chengfeng.controller.vo.PostReplyVo;
7 import com.beautifulsoup.chengfeng.controller.vo.PosterVo;
8 import com.beautifulsoup.chengfeng.pojo.Journalism;
9 import com.beautifulsoup.chengfeng.pojo.PostNews;
10 import com.beautifulsoup.chengfeng.service.PostNewsService;
11 import com.beautifulsoup.chengfeng.service.dto.PostNewsDto;
create-manual.py (git://github.com/residuum/PuRestJson.git) Python · 68 lines
test_sitegen.py (https://gitlab.com/Ivy001/pants) Python · 213 lines
html.py (https://github.com/feyin/lamson.git) Python · 180 lines
25 """
27 from BeautifulSoup import BeautifulSoup
28 import clevercss
29 from lamson import mail, view
93 """
94 Used mostly internally but helpful for testing, this takes the given HTML
95 and applies the configured CSS you've set. It returns a BeautifulSoup
96 object with all the style attributes set and nothing else changed.
97 """
98 doc = BeautifulSoup(html)
99 roots = {} # the roots rarely change, even though the paths do
get_kottke.py (https://github.com/wilson428/Robottke.git) Python · 97 lines
7 from datetime import datetime
9 from BeautifulSoup import BeautifulSoup
11 prefix = '../'
33 url = "http://kottke.org/" + syear + "/" + smonth + "/"
34 print "<-----------------------" + url
35 soup = BeautifulSoup(urllib2.urlopen(url))
36 for entry in soup.findAll('div', { "class" : "post" }):
37 try:
soup.py (https://gitlab.com/tlevine/dexy) Python · 132 lines
1 from bs4 import BeautifulSoup
2 from dexy.filter import DexyFilter
3 from dexy.utils import chdir
12 Add <script> tags or <link> tags to an HTML file's header.
14 Uses BeautifulSoup.
15 """
16 aliases = ['customize']
23 def process_text(self, input_text):
24 soup = BeautifulSoup(input_text)
26 for js in self.setting('scripts'):
42 _settings = {
43 'html-parser' : ("Name of html parser BeautifulSoup should use.", 'html.parser'),
44 'inline-images' : ("Whether to inline images using the data uri scheme.", True),
45 'inline-styles' : ("Whether to embed referenced CSS in the page header.", True)
institution_test_suite.py (https://github.com/adsabs/scripts-affiliation-disambiguation.git) Python · 187 lines
test_lxml.py (https://github.com/openhatch/oh-mainline.git) Python · 91 lines
Makefile (https://bitbucket.org/bendikro/deluge-yarss-plugin.git) Makefile · 130 lines
73 @echo "Build finished; now you can run "qcollectiongenerator" with the" \
74 ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
75 @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BeautifulSoup.qhcp"
76 @echo "To view the help file:"
77 @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BeautifulSoup.qhc"
82 @echo "Build finished."
83 @echo "To view the help file:"
84 @echo "# mkdir -p $$HOME/.local/share/devhelp/BeautifulSoup"
85 @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BeautifulSoup"
ServantStats.py (https://bitbucket.org/TheMysteryofDoom/doom-utilityapp.git) Python · 105 lines
2 import re
3 from discord.ext import commands
4 try: # check if BeautifulSoup4 is installed
5 from bs4 import BeautifulSoup
32 url = "http://fategrandorder.wikia.com/wiki/"+searcharg
33 async with aiohttp.get(url) as response:
34 soup = BeautifulSoup(await response.text(), 'html.parser')
35 try:
36 base = soup.find("div", {"class": "ServantInfoStatsWrapper"})
102 bot.add_cog(ServantStats(bot))
103 else:
104 raise RuntimeError("You need to run `pip3 install beautifulsoup4`")
translate-update.py (https://github.com/telegram-zhCN/telegram-language-resources.git) Python · 65 lines
2 import os
3 import requests
4 from bs4 import BeautifulSoup
6 identification = os.environ.get('TRANSIFEX_USERNAME')
22 s = requests.session()
23 r = s.get('https://www.transifex.com/signin/')
24 soup = BeautifulSoup(r.text, 'html.parser')
25 csrftoken = soup.find('input', {'name': 'csrfmiddlewaretoken'})['value']
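The token lookup above raises a TypeError if the hidden field is missing; a slightly defensive variant of the same hidden-input scrape:

    import requests
    from bs4 import BeautifulSoup

    with requests.Session() as s:
        r = s.get("https://www.transifex.com/signin/")
        soup = BeautifulSoup(r.text, "html.parser")
        field = soup.find("input", {"name": "csrfmiddlewaretoken"})
        csrftoken = field["value"] if field else None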
test_flask_get.py (https://github.com/fredrik-corneliusson/click-web.git) Python · 74 lines
snippet.py (https://github.com/gistable/gistable.git) Python · 180 lines
statusserver.py (https://github.com/DooMLoRD/Xperia-2011-Official-Kernel-Sources.git) Python · 96 lines
parser.py (https://github.com/vishnevskiy/bbcodepy.git) Python · 112 lines
DNSDumpsterAPI.py (https://github.com/m0rtem/CloudFail.git) Python · 84 lines
10 import requests
12 from bs4 import BeautifulSoup
54 req = s.get(dnsdumpster_url)
55 soup = BeautifulSoup(req.content, 'html.parser')
56 csrf_middleware = soup.findAll('input', attrs={'name': 'csrfmiddlewaretoken'})[0]['value']
57 self.display_message('Retrieved token: %s' % csrf_middleware)
74 return []
76 soup = BeautifulSoup(req.content, 'html.parser')
77 tables = soup.findAll('table')
create-toolboxes.sh (https://github.com/bogdan2412/dotfiles.git) Shell · 51 lines
VALLA.py (https://bitbucket.org/zhangjiejun/sjtuonlinejudge.git) Python · 75 lines
tempdir.patch (https://github.com/1000timesdead/portage.git) Patch · 71 lines
bills.py (https://github.com/jsoma/openstates.git) Python · 114 lines
bpython-settings.py (https://bitbucket.org/alexanderbohn/tessar.git) Python · 80 lines
soupparser.py (https://github.com/jcrobak/hue.git) Python · 122 lines
7 from BeautifulSoup import \
8 BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
11 def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
12 """Parse a string of HTML data into an Element tree using the
13 BeautifulSoup parser.
25 def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
26 """Parse a file into an ElemenTree using the BeautifulSoup parser.
28 You can pass a different BeautifulSoup parser through the
60 if beautifulsoup is None:
61 beautifulsoup = BeautifulSoup
62 if makeelement is None:
63 makeelement = html.html_parser.makeelement
defines.py (https://github.com/seppius-xbmc-repo/ru.git) Python · 124 lines
7 import threading
8 import os
9 from BeautifulSoup import BeautifulSoup
11 ADDON = xbmcaddon.Addon(id='script.torrent-tv.ru.pp')
111 def checkPort(params):
112 data = GET("http://2ip.ru/check-port/?port=%s" % params)
113 beautifulSoup = BeautifulSoup(data)
114 port = beautifulSoup.find('div', attrs={'class': 'ip-entry'}).text
pingback.py (https://github.com/emilian/django-blog-zinnia.git) Python · 140 lines
15 from zinnia.models import Entry
16 from zinnia.settings import PINGBACK_CONTENT_LENGTH
17 from BeautifulSoup import BeautifulSoup
18 from django_xmlrpc.decorators import xmlrpc_func
92 return TARGET_IS_NOT_PINGABLE
94 soup = BeautifulSoup(document)
95 title = soup.find('title')
96 title = title and strip_tags(title) or _('No title')
scrape.py (https://github.com/pcsforeducation/diveintopython.git) Python · 127 lines
1 from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup, Comment
2 import urllib
3 import os
13 try:
14 p = open('dip.html', 'r')
15 soup = BeautifulSoup(p.read())
16 except IOError, e:
17 print "io error code: %d msg: %s" % (e.returncode, e.message)
34 with open(filename, 'r') as f:
36 soup = BeautifulSoup(f)
37 print "working on %s" % (filename, )
38 for div in soup.findAll('div'):
95 soup.head.insert(len(soup.head.contents), code)
97 new_soup = BeautifulSoup(soup.renderContents())
98 for i in new_soup.findAll('a'):
99 if i.has_key('href'):
test_functional.py (https://github.com/encukou/deform.git) Python · 255 lines
__init__.py (https://github.com/235/django-template-introspection.git) Python · 91 lines
2 from django.conf import settings
3 from django.template import Template, StringOrigin
4 from BeautifulSoup import BeautifulSoup, Tag
5 from hashlib import md5
6 import inspect
67 #add an attribute to each HTML-tag with a given hash or update existing
68 #WARNING: if the produced HTML is invalid, BeautifulSoup will try to fix it
69 soup = BeautifulSoup(output)