/WebCrawler/mgstage.py

https://github.com/yoshiko2/AV_Data_Capture · Python · 129 lines · 121 code · 2 blank · 6 comment · 5 complexity · 5cbb3650a3a68442d58c6ab786e8efa7 MD5 · raw file

  1. import sys
  2. sys.path.append('../')
  3. import re
  4. from lxml import etree
  5. import json
  6. from bs4 import BeautifulSoup
  7. from ADC_function import *
  8. # import sys
  9. # import io
  10. # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
  11. def getTitle(a):
  12. try:
  13. html = etree.fromstring(a, etree.HTMLParser())
  14. result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
  15. return result.replace('/', ',')
  16. except:
  17. return ''
  18. def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
  19. html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
  20. result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
  21. result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
  22. return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
  23. def getStudio(a):
  24. html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
  25. result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
  26. result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
  27. return str(result1+result2).strip('+').replace("', '",'').replace('"','')
  28. def getRuntime(a):
  29. html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
  30. result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
  31. result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
  32. return str(result1 + result2).strip('+').rstrip('mi')
  33. def getLabel(a):
  34. html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
  35. result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
  36. '\\n')
  37. result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
  38. '\\n')
  39. return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
  40. def getNum(a):
  41. html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
  42. result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
  43. '\\n')
  44. result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
  45. '\\n')
  46. return str(result1 + result2).strip('+')
  47. def getYear(getRelease):
  48. try:
  49. result = str(re.search('\d{4}',getRelease).group())
  50. return result
  51. except:
  52. return getRelease
  53. def getRelease(a):
  54. html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
  55. result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
  56. '\\n')
  57. result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
  58. '\\n')
  59. return str(result1 + result2).strip('+').replace('/','-')
  60. def getTag(a):
  61. html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
  62. result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
  63. '\\n')
  64. result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
  65. '\\n')
  66. result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
  67. total = []
  68. for i in result:
  69. try:
  70. total.append(translateTag_to_sc(i))
  71. except:
  72. pass
  73. return total
  74. def getCover(htmlcode):
  75. html = etree.fromstring(htmlcode, etree.HTMLParser())
  76. result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
  77. # /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
  78. return result
  79. def getDirector(a):
  80. html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
  81. result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
  82. '\\n')
  83. result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
  84. '\\n')
  85. return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
  86. def getOutline(htmlcode):
  87. html = etree.fromstring(htmlcode, etree.HTMLParser())
  88. result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
  89. return result
  90. def getSeries(a):
  91. html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
  92. result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
  93. '\\n')
  94. result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
  95. '\\n')
  96. return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
  97. def main(number2):
  98. number=number2.upper()
  99. htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
  100. soup = BeautifulSoup(htmlcode, 'lxml')
  101. a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
  102. b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
  103. #print(b)
  104. dic = {
  105. 'title': getTitle(htmlcode).replace("\\n",'').replace(' ',''),
  106. 'studio': getStudio(a),
  107. 'outline': getOutline(b),
  108. 'runtime': getRuntime(a),
  109. 'director': getDirector(a),
  110. 'actor': getActor(a),
  111. 'release': getRelease(a),
  112. 'number': getNum(a),
  113. 'cover': getCover(htmlcode),
  114. 'imagecut': 0,
  115. 'tag': getTag(a),
  116. 'label':getLabel(a),
  117. 'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()),
  118. 'actor_photo': '',
  119. 'website':'https://www.mgstage.com/product/product_detail/'+str(number)+'/',
  120. 'source': 'mgstage.py',
  121. 'series': getSeries(a),
  122. }
  123. js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
  124. return js
  125. #print(htmlcode)
  126. if __name__ == '__main__':
  127. print(main('SIRO-4149'))