/4-crossreads/data/14-top-Topics-jsons.py

https://gitlab.com/jaumetet/crossreads
Python | 66 lines | 41 code | 9 blank | 16 comment | 9 complexity | 7303cdde566b0059242f890b2aa824cb MD5 | raw file
  1. import simplejson as json
  2. import glob
  3. from pprint import pprint
  4. #open diaries.json
  5. f = open("diaries.json", "r")
  6. diariesjson = json.loads(f.read())
  7. g = open("topics.json", "r")
  8. topicsjson = json.loads(g.read())
  9. # Set vars:
  10. TGlabels = ["", "per","war","mil","tra","tou"]
  11. TOPIC_THERSHOLD = 0.1;
  12. TOPLIST_NO = 1001;
  13. path0 = "11-diaryID-jsons/"
  14. topics_list = [1, 3, 5, 6, 7, 8, 9, 12, 14, 15, 17, 18, 19, 20, 23, 24, 25, 26, 27, 33, 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49, 50, 53, 57, 58, 59, 60, 61, 62, 63, 65, 66, 68, 69, 70, 71, 72, 74, 75, 77, 78, 79, 80, 83, 84, 85, 87, 89, 90, 93, 94, 95, 96, 97] ## 64 topics out of 100. Finally 30 Labels.
  15. ## List of available diaries:
  16. #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  17. #!!!! MAIN LOOP NEEDS TO BE TOPICSJSON, AND THEN FOR EACH label, loop all diariesID.json and apply rules
  18. #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  19. totals = []
  20. for thetopic in topicsjson:
  21. top = [[100,"xx"]] ## fake first element. It'll be deleted
  22. TGid = int(str(thetopic["tid"])[0])
  23. print "----------- TGid: "+str(thetopic["mallet_ids_inluded"])
  24. for d in glob.glob(path0+'*'):
  25. #print d
  26. f = open(d, "r")
  27. mydiary = json.loads(f.read())
  28. p = 0
  29. for pages in mydiary[0]["pages"]:
  30. p+=1
  31. for topics in pages["topics"]:
  32. if float(topics[1]) >= TOPIC_THERSHOLD:
  33. ## get TG of this mallet_id, topics[0]
  34. if topics[0] in thetopic["mallet_ids_inluded"]: # and str(thetopic["tid"])[0] == str(TGid):
  35. ## Add conditions to add page to top list: number of element and score > that minimum.
  36. if float(topics[1]) > top[-1][0]:
  37. del top[-1]
  38. if len(top) < TOPLIST_NO:
  39. top.append([float(topics[1]), "/"+d[17:-5]+"/"+str(p)]);
  40. #print p,
  41. ## reorder list desc
  42. top = sorted(top,key=lambda el: (-el[0]));
  43. #####################################
  44. ## Order 1st for label, 2nd by score desc
  45. ## top = sorted(top,key=lambda el: (el[1],-el[0]));
  46. #####################################
  47. print len(top)
  48. totals.append([str(TGid)+'-'+TGlabels[TGid], str(thetopic["tid"])+'-'+thetopic["label"], len(top)])
  49. ## write top to file
  50. if len(top) >0:
  51. del top[0]
  52. f = open('14-tops/Topic-'+str(TGid)+'-'+TGlabels[TGid]+'--'+str(thetopic["tid"])+'-'+thetopic["label"].replace(" ","_")+'.json','w+')
  53. f.write(str(top))
  54. f.close()
  55. print '\t>>> Written --> 14-tops/Topic-'+str(TGid)+'-'+TGlabels[TGid]+'--'+str(thetopic["tid"])+'-'+thetopic["label"].replace(" ","_")+'.json'
  56. print "------"
  57. pprint(totals)