/decoder/forms.py

https://github.com/stamen/fieldpapers · Python · 142 lines · 95 code · 22 blank · 25 comment · 15 complexity · 06730ee8aa1c3e6da4a7abc3274f232d MD5 · raw file

  1. # placeholder module for form HTML parsing.
  2. import sys
  3. from urllib import urlopen
  4. from urlparse import urljoin
  5. import json
  6. from BeautifulSoup import BeautifulSoup
  7. from apiutils import finish_form, fail_form
  8. def fields_as_text(form_fields):
  9. """
  10. """
  11. labels = [field['label'] for field in form_fields]
  12. text = '\n\n\n\n'.join(labels)
  13. return text
  14. def get_form_fields(url):
  15. """ Gets a data structure of form fields for an HTML form URL, return a dictionary.
  16. """
  17. page = urlopen(url)
  18. soup = BeautifulSoup(page)
  19. form = soup.form
  20. # Setting up data structure
  21. form_data = dict(fields=[])
  22. form_attr = dict(form.attrs)
  23. form_data['title'] = soup.h1 and soup.h1.text or soup.title.text
  24. form_data['action'] = urljoin(url, form_attr['action'])
  25. form_data['method'] = form_attr['method']
  26. # Get a list of the entry labels
  27. labels = form.findAll(['label'], {"class": "ss-q-title"})
  28. label_contents = []
  29. for label in labels:
  30. label_contents.append({label.attrs[1][0]: label.attrs[1][1], 'contents': label.contents[0]})
  31. #print label_contents
  32. #
  33. # Handle text input boxes
  34. #
  35. textboxes = form.findAll(['input'], {"type": "text"})
  36. #textbox_description = {}
  37. for textbox in textboxes:
  38. textbox_description = {}
  39. for index, label in enumerate(label_contents):
  40. if label_contents[index]['for'] == textbox['id']:
  41. #print label_contents[index]['contents'].strip()
  42. textbox_description['label'] = label_contents[index]['contents'].strip()
  43. break
  44. abbreviated_attributes = dict((k,v) for (k,v) in textbox.attrs if k == "type" or k == "name")
  45. # abbreviated_attributes = {k : v for k in textbox.attrs} # 2.7 and above
  46. # Merge abbreviated attributes with textbox description
  47. textbox_description = dict(textbox_description.items() + abbreviated_attributes.items())
  48. form_data['fields'].append(textbox_description)
  49. #
  50. # Handle the textareas
  51. #
  52. textareas = form.findAll(['textarea'])
  53. for textarea in textareas:
  54. textarea_description = {}
  55. for index, label in enumerate(label_contents):
  56. if label_contents[index]['for'] == textarea['id']:
  57. textarea_description['label'] = label_contents[index]['contents'].strip()
  58. break
  59. abbreviated_attributes = dict((k,v) for (k,v) in textarea.attrs if k == "name")
  60. abbreviated_attributes['type'] = textarea.name
  61. textarea_description = dict(textarea_description.items() + abbreviated_attributes.items())
  62. form_data['fields'].append(textarea_description)
  63. """
  64. Ignore groups of checkboxes for now
  65. ####
  66. # Handle groups of checkboxes
  67. ####
  68. checkboxes = form.findAll(['input'], {'type': 'checkbox'})
  69. # Get your checkbox groups
  70. checkbox_groups = []
  71. for checkbox in checkboxes:
  72. if checkbox['name'] not in checkbox_groups:
  73. checkbox_groups.append(checkbox['name'])
  74. checkbox_questions = {}
  75. for group in checkbox_groups:
  76. checkbox_questions[group] = {'label': {}, 'options': []}
  77. for checkbox in checkboxes:
  78. for group in checkbox_groups:
  79. if checkbox['name'] == group:
  80. checkbox_questions[group]['options'].append({'attributes': dict(checkbox.attrs)})
  81. # Handle the label
  82. checkbox_name_pieces = checkbox['name'].split('.')
  83. checkbox_name_map = checkbox_name_pieces[0] + '_' + checkbox_name_pieces[1]
  84. for label in label_contents:
  85. if label['for'] == checkbox_name_map:
  86. checkbox_questions[group]['label'] = label
  87. page_data['form_contents'].append({'checkbox_groups': checkbox_questions})
  88. """
  89. return form_data
  90. def main(apibase, password, form_id, url, fields_callback=None):
  91. """
  92. """
  93. try:
  94. form_data = get_form_fields(url)
  95. except Exception, e:
  96. print >> sys.stderr, 'Failed because:', e
  97. fail_form(apibase, password, form_id)
  98. else:
  99. if fields_callback:
  100. fields_callback(form_data)
  101. finish_form(apibase, password, form_id, form_data['action'], form_data['method'], form_data['title'], form_data['fields'])
  102. if __name__ == '__main__':
  103. form_url = len(sys.argv) == 2 and sys.argv[1] or 'https://docs.google.com/spreadsheet/viewform?formkey=dFZsNVprWDY3REM3MnpjbW9rTGkzQUE6MQ'
  104. #get_form_fields(form_url)
  105. json.dump(get_form_fields(form_url), sys.stdout, indent=2)