From eaaa34daef00c11a2b0a8f79cc1c948336b81b5f Mon Sep 17 00:00:00 2001
From: Michael Suo <suo@fb.com>
Date: Thu, 9 Jun 2022 23:17:32 -0700
Subject: [PATCH] [ci] write test suites to rockset

Currently we upload all `testcase` elements as individual test runs to
Rockset. It would be nice to have `testsuite` elements as well, which
aggregate high-level information.

These aggregations could technically be performed in the backend, but it's
faster to just log the data since we already have it in the XML test
report.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/79265

Approved by: https://github.com/seemethere
---
 tools/stats/upload_test_stats.py     | 54 +++++++++++++++++-----------
 tools/test/test_upload_test_stats.py |  5 +--
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/tools/stats/upload_test_stats.py b/tools/stats/upload_test_stats.py
index a3efb408240..9efa60fd0f8 100644
--- a/tools/stats/upload_test_stats.py
+++ b/tools/stats/upload_test_stats.py
@@ -4,7 +4,7 @@ import requests
 import zipfile
 import xml.etree.ElementTree as ET
 from pathlib import Path
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Tuple
 from tempfile import TemporaryDirectory
 
 import rockset  # type: ignore[import]
@@ -22,10 +22,10 @@ def get_request_headers() -> Dict[str, str]:
 
 
 def parse_xml_report(
-    report: Path, workflow_id: int, workflow_run_attempt: int
+    tag: str, report: Path, workflow_id: int, workflow_run_attempt: int
 ) -> List[Dict[str, Any]]:
     """Convert a test report xml file into a JSON-serializable list of test cases."""
-    print(f"Parsing test report: {report}")
+    print(f"Parsing {tag}s for test report: {report}")
     # [Job id in artifacts]
     # Retrieve the job id from the report path. In our GHA workflows, we append
     # the job id to the end of the report name, so `report` looks like:
@@ -37,7 +37,7 @@ def parse_xml_report(
     root = ET.parse(report)
 
     test_cases = []
-    for test_case in root.iter("testcase"):
+    for test_case in root.iter(tag):
         case = process_xml_element(test_case)
         case["workflow_id"] = workflow_id
         case["workflow_run_attempt"] = workflow_run_attempt
@@ -58,14 +58,17 @@ def process_xml_element(element: ET.Element) -> Dict[str, Any]:
     #     {"name": "test_foo", "classname": "test_bar"}
     ret.update(element.attrib)
 
-    # By default, all attributes are strings. Apply a few special conversions
-    # here for well-known attributes so that they are the right type in Rockset.
-    line = ret.get("line")
-    if line:
-        ret["line"] = int(line)
-    time = ret.get("time")
-    if time:
-        ret["time"] = float(time)
+    # The XML format encodes all values as strings. Convert to ints/floats if
+    # possible to make aggregation possible in Rockset.
+    for k, v in ret.items():
+        try:
+            ret[k] = int(v)
+        except ValueError:
+            pass
+        try:
+            ret[k] = float(v)
+        except ValueError:
+            pass
 
     # Convert inner and outer text into special dict elements.
     # e.g.
@@ -181,18 +184,18 @@ def download_and_extract_gha_artifacts(
         download_and_extract_artifact(Path(name), url, workflow_run_attempt)
 
 
-def upload_to_rockset(test_cases: List[Any]) -> None:
-    print(f"Writing {len(test_cases)} test cases to Rockset")
+def upload_to_rockset(collection: str, docs: List[Any]) -> None:
+    print(f"Writing {len(docs)} documents to Rockset")
     client = rockset.Client(
         api_server="api.rs2.usw2.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
     )
-    client.Collection.retrieve("test_run").add_docs(test_cases)
+    client.Collection.retrieve(collection).add_docs(docs)
     print("Done!")
 
 
-def get_test_cases(
+def get_tests(
     workflow_run_id: int, workflow_run_attempt: int
-) -> List[Dict[str, Any]]:
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
     with TemporaryDirectory() as temp_dir:
         print("Using temporary directory:", temp_dir)
         os.chdir(temp_dir)
@@ -203,16 +206,26 @@ def get_test_cases(
 
         # Parse the reports and transform them to JSON
         test_cases = []
+        test_suites = []
         for xml_report in Path(".").glob("**/*.xml"):
             test_cases.extend(
                 parse_xml_report(
+                    "testcase",
+                    xml_report,
+                    workflow_run_id,
+                    workflow_run_attempt,
+                )
+            )
+            test_suites.extend(
+                parse_xml_report(
+                    "testsuite",
                     xml_report,
                     workflow_run_id,
                     workflow_run_attempt,
                 )
             )
 
-        return test_cases
+        return test_cases, test_suites
 
 
 if __name__ == "__main__":
@@ -230,5 +243,6 @@ if __name__ == "__main__":
         help="which retry of the workflow this is",
     )
     args = parser.parse_args()
-    test_cases = get_test_cases(args.workflow_run_id, args.workflow_run_attempt)
-    upload_to_rockset(test_cases)
+    test_cases, test_suites = get_tests(args.workflow_run_id, args.workflow_run_attempt)
+    upload_to_rockset("test_run", test_cases)
+    upload_to_rockset("test_suite", test_suites)
diff --git a/tools/test/test_upload_test_stats.py b/tools/test/test_upload_test_stats.py
index 88972dec138..71b5c4c6c05 100644
--- a/tools/test/test_upload_test_stats.py
+++ b/tools/test/test_upload_test_stats.py
@@ -3,7 +3,7 @@ import os
 
 IN_CI = os.environ.get("CI")
 
-from tools.stats.upload_test_stats import get_test_cases
+from tools.stats.upload_test_stats import get_tests
 
 
 class TestUploadTestStats(unittest.TestCase):
@@ -13,8 +13,9 @@ class TestUploadTestStats(unittest.TestCase):
     )
     def test_existing_job(self) -> None:
         """Run on a known-good job and make sure we don't error and get basically okay reults."""
-        test_cases = get_test_cases(2465214458, 1)
+        test_cases, test_suites = get_tests(2465214458, 1)
         self.assertEqual(len(test_cases), 731457)
+        self.assertEqual(len(test_suites), 7781)
 
 
 if __name__ == "__main__":