
/demos/lx_simple.py

https://github.com/jabbalaci/jabbapylib
#!/usr/bin/env python
"""
Working with lxml.
HTML elements have all the methods that come with ElementTree, but also include
some extra methods (see http://lxml.de/lxmlhtml.html, section HTML Element Methods).
Element: http://docs.python.org/library/xml.etree.elementtree.html#element-objects.
"""

from jabbapylib.web.scraper import lx, scraper, bs
from jabbapylib.web.web import get_page
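
# A hedged illustration of the docstring's point (not part of jabbapylib): an
# lxml.html element answers the usual ElementTree calls (find, get, iteration)
# plus HTML extras such as text_content().
def demo0_element_methods():
    import lxml.html

    doc = lxml.html.fromstring('<div><b>bold</b> and plain</div>')
    print doc.find('b').text    # ElementTree-style API -> bold
    print doc.text_content()    # lxml.html extra -> bold and plain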

def demo1():
    text = """
<html>
<table>
<tr><td>Header</td></tr>
<tr><td>Want This</td></tr>
</table>
<a href="http://google.ca">Google.ca</a>
</html>
"""
    doc = lx.to_doc(text)
    row1 = doc.cssselect('table')[0]
    print row1.cssselect('tr td')[0].text
    print doc.cssselect('a[href]')[0].get('href')
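
# Rough sketch of the same queries with plain lxml.html + cssselect; this is an
# assumption about what lx.to_doc() wraps, not jabbapylib's actual code.
def demo1_plain_lxml():
    import lxml.html    # .cssselect() also needs the cssselect package

    doc = lxml.html.fromstring(
        '<html><body>'
        '<table><tr><td>Header</td></tr><tr><td>Want This</td></tr></table>'
        '<a href="http://google.ca">Google.ca</a>'
        '</body></html>')
    print doc.cssselect('tr td')[1].text            # -> Want This
    print doc.cssselect('a[href]')[0].get('href')   # -> http://google.ca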

def demo2():
    url = 'http://projecteuler.net/'
    text = get_page(url)
    doc = lx.to_doc(text)
    lx.make_links_absolute(doc, base_url=url)
    print lx.tostring(doc)
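
# Hedged sketch: lxml.html documents have a make_links_absolute() method of
# their own, so lx.make_links_absolute() is presumably a thin wrapper over it.
# Reuses get_page from the import at the top of this file.
def demo2_plain_lxml():
    import lxml.html

    url = 'http://projecteuler.net/'
    doc = lxml.html.fromstring(get_page(url))
    doc.make_links_absolute(base_url=url)    # rewrite relative href/src against url
    print lxml.html.tostring(doc)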

def demo3():
    html = '''<html>
<head>
<script type="text/javascript" src="stuff.js"></script>
<link rel="alternate" type="text/rss" src="some-rss">
<style>
body {background-image: url(javascript:do_something)};
div {color: expression(something)};
</style>
</head>
<body onload="some_function()">
Hello World!
</body>
</html>'''
    doc = lx.to_doc(html)
    print lx.prettify(doc, method=scraper.BEAUTIFULSOUP)
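
# Hedged sketch: lx.prettify() with method=scraper.BEAUTIFULSOUP presumably
# round-trips the tree through BeautifulSoup for indented output; lxml itself
# can pretty-print via etree.tostring(pretty_print=True).
def demo3_pretty_print():
    import lxml.html
    from lxml import etree

    doc = lxml.html.fromstring('<html><body onload="f()">Hello World!</body></html>')
    print etree.tostring(doc, pretty_print=True)    # indented serialization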

def demo4():
    text = """
<html>
<table>
<tr><td>http://google.ca</td></tr>
<tr><td>http://reddit.com</td></tr>
</table>
</html>
"""
    doc = lx.to_doc(text)
    doc = lx.autolink(doc)
    print lx.prettify(doc)
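
# Hedged sketch: lx.autolink() most likely delegates to lxml's clean module,
# whose autolink() turns bare URLs in text into <a href="..."> links in place
# (in recent lxml versions this lives in the separate lxml_html_clean package).
def demo4_plain_lxml():
    import lxml.html
    from lxml.html.clean import autolink

    doc = lxml.html.fromstring('<p>see http://reddit.com for details</p>')
    autolink(doc)                    # modifies the tree in place
    print lxml.html.tostring(doc)    # the URL is now wrapped in an <a> tag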

def demo5():
    text = """
<html>
<table>
<tr><td>http://google.ca</td></tr>
<tr><td>http://reddit.com</td></tr>
</table>
</html>
"""
    doc = lx.to_doc(text)
    lx.show_paths(doc)
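
# Hedged sketch: lx.show_paths() is assumed to print an XPath location for each
# element; plain lxml exposes that via ElementTree.getpath().
def demo5_plain_lxml():
    import lxml.html

    doc = lxml.html.fromstring('<html><table><tr><td>x</td></tr></table></html>')
    tree = doc.getroottree()
    for el in doc.iter():
        print tree.getpath(el)    # e.g. /html/body/table/tr/td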

def demo6():
    text = """<ul>
<li>abc</li>
<li>def
<li>ghi</li>
</ul>"""
    doc = lx.to_doc(text)
    for li in doc.cssselect('ul li'):
        print li.text.strip()
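
# The markup above leaves the second <li> unclosed on purpose; lxml's HTML
# parser repairs it. A plain-lxml version (a sketch, not jabbapylib code)
# therefore still sees three list items.
def demo6_plain_lxml():
    import lxml.html

    doc = lxml.html.fromstring('<ul><li>abc</li><li>def<li>ghi</li></ul>')
    items = doc.cssselect('ul li')
    print len(items)                    # -> 3 (the dangling <li> gets closed)
    print [li.text for li in items]     # -> ['abc', 'def', 'ghi']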

def demo7():
    text = """<html>
<body>
<div></div>
<div id="content">
<ul>
<li>First item</li>
<li>Second item</li>
</ul>
</div>
</body>
</html>"""
    doc = lx.to_doc(text)
    lx.show_paths(doc)
    for tag in doc.cssselect('div#content ul li'):
        print tag.text
    print lx.css_to_xpath('div#content ul li')
    lx.open_in_browser(doc)
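
# Hedged sketch of the two helpers used above: a CSS selector's XPath form is
# available through lxml.cssselect (the cssselect package does the translation),
# and lxml.html.open_in_browser() dumps a tree to a temp file and opens it in
# the default browser.
def demo7_plain_lxml():
    import lxml.html
    from lxml.cssselect import CSSSelector

    sel = CSSSelector('div#content ul li')
    print sel.path    # the compiled XPath expression behind the selector

    doc = lxml.html.fromstring('<div id="content"><ul><li>First item</li></ul></div>')
    for tag in sel(doc):    # a CSSSelector is callable and returns matching elements
        print tag.text
    # lxml.html.open_in_browser(doc)    # uncomment to view the tree in a browser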

def demo8():
    url = 'http://python.org/'
    text = get_page(url)
    #doc = lx.to_doc(text, parser=scraper.HTML5PARSER)
    #doc = lx.to_doc(text)
    doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    #print type(doc)
    #print etree.tostring(doc)
    title = doc.cssselect('html head title')[0]
    print title.text
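
# Hedged sketch: the three parser choices in demo8 appear to map onto back-ends
# lxml itself provides -- the default HTML parser, html5lib via
# lxml.html.html5parser, and BeautifulSoup via lxml.html.soupparser (the latter
# two need the html5lib and BeautifulSoup packages installed).
def demo8_parsers(text):
    import lxml.html
    import lxml.html.html5parser
    import lxml.html.soupparser

    doc_default = lxml.html.fromstring(text)              # libxml2 HTML parser
    doc_html5 = lxml.html.html5parser.fromstring(text)    # html5lib
    doc_soup = lxml.html.soupparser.fromstring(text)      # BeautifulSoup
    return doc_default, doc_html5, doc_soup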

def demo9():
    url = 'http://python.org/'
    text = get_page(url)
    soup = bs.to_soup(text)
    title = soup.findCssSelect('html head title')[0]
    print title.text
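
# Hedged sketch: findCssSelect() comes from jabbapylib's bs wrapper; with
# BeautifulSoup 4 on its own, the CSS-selector query would be soup.select().
def demo9_bs4():
    from bs4 import BeautifulSoup    # assumes the bs4 package is installed

    soup = BeautifulSoup('<html><head><title>Hi</title></head></html>', 'lxml')
    print soup.select('html head title')[0].get_text()    # -> Hi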

#############################################################################

if __name__ == "__main__":
    #demo1()
    #demo2()
    #demo3()
    #demo4()
    #demo5()
    #demo6()
    #demo7()
    #demo8()
    demo9()
    pass