/src/dev-utils/check_gpu_count.py

https://github.com/Microsoft/DLWorkspace · Python · 192 lines · 159 code · 30 blank · 3 comment · 31 complexity · 9a6cb89817175bc5fdf8afd780484fe9 MD5 · raw file

  1. #!/usr/bin/env python3
  2. import urllib.parse
  3. import argparse
  4. import logging
  5. import datetime
  6. import pprint
  7. import sys
  8. import requests
  9. logger = logging.getLogger(__name__)
  10. def walk_json_field_safe(obj, *fields):
  11. """ for example a=[{"a": {"b": 2}}]
  12. walk_json_field_safe(a, 0, "a", "b") will get 2
  13. walk_json_field_safe(a, 0, "not_exist") will get None
  14. """
  15. try:
  16. for f in fields:
  17. obj = obj[f]
  18. return obj
  19. except:
  20. return None
  21. # capacity, available, unschedulable, used
  22. def get_restful_data(args):
  23. result = {}
  24. node_result = {}
  25. for vc_name in args.vc.split(","):
  26. query = urllib.parse.urlencode({
  27. "vcName": vc_name,
  28. "userName": str(args.alias + "@microsoft.com"),
  29. })
  30. url = urllib.parse.urljoin(args.rest_url, "/GetVC") + "?" + query
  31. body = requests.get(url).json()
  32. result[vc_name] = {
  33. "total":
  34. body["gpu_capacity"].get("P40")
  35. or body["gpu_capacity"].get("V100"),
  36. "available":
  37. body["gpu_avaliable"].get("P40")
  38. or body["gpu_avaliable"].get("V100"),
  39. "unschedulable":
  40. body["gpu_unschedulable"].get("P40")
  41. or body["gpu_unschedulable"].get("V100"),
  42. }
  43. if len(node_result) == 0:
  44. for node in body["node_status"]:
  45. node_result[node["InternalIP"]] = {}
  46. node_result[node["InternalIP"]]["total"] = walk_json_field_safe(
  47. node, "gpu_capacity", "P40") or walk_json_field_safe(
  48. node, "gpu_capacity", "V100") or 0
  49. if node["unschedulable"]:
  50. node_result[node["InternalIP"]]["allocatable"] = 0
  51. else:
  52. node_result[node["InternalIP"]][
  53. "allocatable"] = walk_json_field_safe(
  54. node, "gpu_allocatable",
  55. "P40") or walk_json_field_safe(
  56. node, "gpu_allocatable", "V100") or 0
  57. node_result[node["InternalIP"]]["used"] = walk_json_field_safe(
  58. node, "gpu_used", "P40") or walk_json_field_safe(
  59. node, "gpu_used", "V100") or 0
  60. node_result[node["InternalIP"]][
  61. "preemtable_used"] = walk_json_field_safe(
  62. node,
  63. "gpu_preemptable_used", "P40") or walk_json_field_safe(
  64. node, "gpu_preemptable_used", "V100") or 0
  65. return result, node_result
  66. def get_prometheus_data(args):
  67. queries = [
  68. "k8s_vc_gpu_total",
  69. "k8s_vc_gpu_available",
  70. "k8s_vc_gpu_unschedulable",
  71. ]
  72. result = {}
  73. for query in queries:
  74. params = urllib.parse.urlencode({"query": query})
  75. url = urllib.parse.urljoin(args.prometheus_url,
  76. "/prometheus/api/v1/query") + "?" + params
  77. body = requests.get(url).json()
  78. for metric in body["data"]["result"]:
  79. vc_name = metric["metric"]["vc_name"]
  80. if vc_name not in result:
  81. result[vc_name] = {}
  82. result[vc_name][query] = int(metric["value"][1])
  83. for vc_name, m in result.items():
  84. m["total"] = m.pop("k8s_vc_gpu_total")
  85. m["available"] = m.pop("k8s_vc_gpu_available")
  86. m["unschedulable"] = m.pop("k8s_vc_gpu_unschedulable")
  87. node_queries = [
  88. "k8s_node_gpu_total",
  89. "k8s_node_gpu_allocatable",
  90. "k8s_node_gpu_available",
  91. "k8s_node_preemptable_gpu_available",
  92. ]
  93. node_result = {}
  94. for query in node_queries:
  95. params = urllib.parse.urlencode({"query": query})
  96. url = urllib.parse.urljoin(args.prometheus_url,
  97. "/prometheus/api/v1/query") + "?" + params
  98. body = requests.get(url).json()
  99. for metric in body["data"]["result"]:
  100. ip = metric["metric"]["host_ip"]
  101. if ip not in node_result:
  102. node_result[ip] = {}
  103. node_result[ip][query] = int(metric["value"][1])
  104. for ip, m in node_result.items():
  105. # total, allocatable, used, preemtable_used,
  106. m["total"] = m["k8s_node_gpu_total"]
  107. m["allocatable"] = m["k8s_node_gpu_allocatable"]
  108. m["used"] = m["k8s_node_gpu_allocatable"] - m["k8s_node_gpu_available"]
  109. m["preemtable_used"] = m["k8s_node_gpu_available"] - m[
  110. "k8s_node_preemptable_gpu_available"]
  111. m.pop("k8s_node_gpu_total")
  112. m.pop("k8s_node_gpu_allocatable")
  113. m.pop("k8s_node_gpu_available")
  114. m.pop("k8s_node_preemptable_gpu_available")
  115. return result, node_result
  116. def main(args):
  117. pp = pprint.PrettyPrinter()
  118. result, node_result = get_restful_data(args)
  119. presult, p_node_result = get_prometheus_data(args)
  120. has_diff = False
  121. if result != presult:
  122. pp.pprint(result)
  123. pp.pprint(presult)
  124. print("-" * 80)
  125. has_diff = True
  126. for ip, rest_info in node_result.items():
  127. p_info = p_node_result[ip]
  128. if rest_info != p_info:
  129. print(ip)
  130. print("restful result")
  131. pp.pprint(rest_info)
  132. print("prometheus result")
  133. pp.pprint(p_info)
  134. print("-" * 40)
  135. has_diff = True
  136. if has_diff:
  137. sys.exit(1)
  138. if __name__ == "__main__":
  139. logging.basicConfig(
  140. format=
  141. "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s",
  142. level=logging.INFO)
  143. parser = argparse.ArgumentParser(
  144. description="Check gpu count from prometheus and restfulapi")
  145. parser.add_argument("--prometheus_url",
  146. "-p",
  147. default="http://127.0.0.1:9091",
  148. help="Prometheus url, eg: http://127.0.0.1:9091")
  149. parser.add_argument("--rest_url",
  150. "-r",
  151. default="http://127.0.0.1:5006",
  152. help="Restfulapi url, eg: http://127.0.0.1:9091")
  153. parser.add_argument("--alias",
  154. "-a",
  155. default="dixu",
  156. help="alias to query restfulapi, eg: dixu")
  157. parser.add_argument("--vc",
  158. "-l",
  159. default="quantus,relevance2,relevance2inf",
  160. help="vc list to query, comma separated")
  161. args = parser.parse_args()
  162. main(args)