When you submit a video for processing, Hive's backend splits the video into frames, runs the model on each frame, and then recombines the per-frame results into a single output JSON.

For a classifier, the video output contains one object per frame with time and classes fields; for a detector, it contains one object per frame with time and bounding_poly fields. For example, a classifier's video output looks like this:

{
  "output": [
    {
      "time": 0,
      "classes": [
        {
          "class": "general_not_nsfw_not_suggestive",
          "score": 0.7058501595243861
        },
        {
          "class": "general_nsfw",
          "score": 0.12669138033591393
        },
        {
          "class": "general_suggestive",
          "score": 0.16745846013969992
        }
      ],
      "media_link": "www.media_url.com/image1.jpg",
      "subtask_no": 366
    },
    {
      "time": 0.9676333333333333,
      "classes": [
        {
          "class": "general_not_nsfw_not_suggestive",
          "score": 0.9986390064645149
        },
        {
          "class": "general_nsfw",
          "score": 0.0008006852563126932
        },
        {
          "class": "general_suggestive",
          "score": 0.0005603082791726574
        }
      ],
      "media_link": "www.media_url.com/image2.jpg",
      "subtask_no": 367
    },
    {
      "time": 1.9686333333333335,
      "classes": [
        {
          "class": "general_not_nsfw_not_suggestive",
          "score": 0.9999421468597295
        },
        {
          "class": "general_nsfw",
          "score": 0.00003785532083901699
        },
        {
          "class": "general_suggestive",
          "score": 0.000019997819431634897
        }
      ],
      "subtask_no": 368
    }
  ]
}
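
To work with this response programmatically, you typically iterate over the output array and keep the highest-scoring class for each frame. Below is a minimal sketch in Python; it assumes the response above has already been parsed into a dictionary named response, and the function name top_class_per_frame is illustrative rather than part of Hive's API.

# Minimal sketch: summarize a classifier's video output by taking the
# highest-scoring class for each sampled frame. Assumes `response` is the
# parsed JSON shown above; names here are illustrative, not Hive's API.

def top_class_per_frame(response):
    summary = []
    for frame in response["output"]:
        # Each frame object carries a timestamp and a list of class scores.
        best = max(frame["classes"], key=lambda c: c["score"])
        summary.append({
            "time": frame["time"],
            "class": best["class"],
            "score": best["score"],
        })
    return summary

# Example: print the dominant class at each sampled timestamp.
for entry in top_class_per_frame(response):
    print(f'{entry["time"]:.2f}s  {entry["class"]}  ({entry["score"]:.3f})')

For the sample above, this would report general_not_nsfw_not_suggestive as the dominant class at each of the three timestamps.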