[FFmpeg-devel] [PATCH] Vulkan hwcontext and filters

Mark Thompson sw at jkqxz.net
Wed Jan 22 01:39:26 EET 2020


On 19/01/2020 13:11, Lynne wrote:
> Jan 18, 2020, 20:23 by sw at jkqxz.net:
> 
>> On 10/01/2020 21:05, Lynne wrote:
>>
>> The CUDA parts look like they could be split off into a separate commit?  (It's already huge.)
>>
> 
> I don't really want broken commits or commits with dead code.

Shouldn't be - you've got nice #ifdef markers surrounding exactly the right pieces to splice out :)

>>> @@ -3639,7 +3641,7 @@ avformat_deps="avcodec avutil"
>>>  avformat_suggest="libm network zlib"
>>>  avresample_deps="avutil"
>>>  avresample_suggest="libm"
>>> -avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32 vaapi videotoolbox corefoundation corevideo coremedia bcrypt"
>>> +avutil_suggest="clock_gettime ffnvcodec libm libdrm libmfx opencl user32 vaapi vulkan videotoolbox corefoundation corevideo coremedia bcrypt"
>>>  postproc_deps="avutil gpl"
>>>  postproc_suggest="libm"
>>>  swresample_deps="avutil"
>>> @@ -6626,6 +6628,9 @@ enabled vdpau &&
>>>  
>>>  enabled crystalhd && check_lib crystalhd "stdint.h libcrystalhd/libcrystalhd_if.h" DtsCrystalHDVersion -lcrystalhd
>>>  
>>> +enabled vulkan &&
>>> +    require_pkg_config vulkan "vulkan >= 1.1.97" "vulkan/vulkan.h" vkCreateInstance
>>>
>>
>> Presumably you have some specific requirement in mind which wants this version - can you note it somewhere?  (Either here or in the commit message.)
>>
> 
> The DRM export/import and I think the semaphore import API.
> It's not a new version, it's been out for a long time.

About a year, which probably isn't long enough to assume it's everywhere.  I guess it doesn't really matter, though, because it will only ever increase when something extra is added.
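
For reference, the same floor could also be asserted at compile time - purely an illustration, not something the patch needs to add; it assumes 1.1.x headers, where VK_HEADER_VERSION carries the patch level:

#include <vulkan/vulkan.h>

/* 97 corresponds to the 1.1.97 release providing the DRM export/import
 * and semaphore-import bits mentioned above. */
#if VK_HEADER_VERSION < 97
#error "Vulkan headers older than 1.1.97; DRM export/import support missing"
#endif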

>>> ...
>>> +static int cuda_device_derive(AVHWDeviceContext *device_ctx,
>>> +                              AVHWDeviceContext *src_ctx,
>>> +                              int flags) {
>>> +    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
>>> +    CudaFunctions *cu;
>>> +    const char *src_uuid = NULL;
>>> +    CUcontext dummy;
>>> +    int ret, i, device_count, dev_active = 0;
>>> +    unsigned int dev_flags = 0;
>>> +
>>> +    const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC;
>>> +
>>> +    switch (src_ctx->type) {
>>> +#if CONFIG_VULKAN
>>> +    case AV_HWDEVICE_TYPE_VULKAN: {
>>> +        AVVulkanDeviceContext *vkctx = src_ctx->hwctx;
>>> +        src_uuid = vkctx->device_uuid;
>>> +        break;
>>> +    }
>>> +#endif
>>> +    default:
>>> +        return AVERROR(ENOSYS);
>>> +    }
>>> +
>>> +    if (!src_uuid) {
>>> +        av_log(device_ctx, AV_LOG_ERROR,
>>> +               "Failed to get UUID of source device.\n");
>>> +        goto error;
>>> +    }
>>> +
>>> +    if (cuda_device_init(device_ctx) < 0)
>>> +        goto error;
>>> +
>>> +    cu = hwctx->internal->cuda_dl;
>>> +
>>> +    ret = CHECK_CU(cu->cuInit(0));
>>> +    if (ret < 0)
>>> +        goto error;
>>> +
>>> +    ret = CHECK_CU(cu->cuDeviceGetCount(&device_count));
>>> +    if (ret < 0)
>>> +        goto error;
>>> +
>>> +    hwctx->internal->cuda_device = -1;
>>> +    for (i = 0; i < device_count; i++) {
>>> +        CUdevice dev;
>>> +        CUuuid uuid;
>>> +
>>> +        ret = CHECK_CU(cu->cuDeviceGet(&dev, i));
>>> +        if (ret < 0)
>>> +            goto error;
>>> +
>>> +        ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev));
>>> +        if (ret < 0)
>>> +            goto error;
>>> +
>>> +        if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) {
>>> +            hwctx->internal->cuda_device = dev;
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    if (hwctx->internal->cuda_device == -1) {
>>> +        av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA device.\n");
>>>
>>
>> This error is maybe more like "Can't find the matching CUDA device to the supplied Vulkan device".
>>
> 
> Let's keep it that way, it's not meant to be Vulkan-specific, though it's only used by Vulkan currently.

Good point!  Fair enough.

>>> +        goto error;
>>> +    }
>>> +
>>> +    hwctx->internal->flags = flags;
>>> +
>>> +    if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) {
>>> +        ret = CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device, &dev_flags, &dev_active));
>>> +        if (ret < 0)
>>> +            goto error;
>>> +
>>> +        if (dev_active && dev_flags != desired_flags) {
>>> +            av_log(device_ctx, AV_LOG_ERROR, "Primary context already active with incompatible flags.\n");
>>> +            goto error;
>>> +        } else if (dev_flags != desired_flags) {
>>> +            ret = CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device, desired_flags));
>>> +            if (ret < 0)
>>> +                goto error;
>>> +        }
>>> +
>>> +        ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx, hwctx->internal->cuda_device));
>>> +        if (ret < 0)
>>> +            goto error;
>>> +    } else {
>>> +        ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags, hwctx->internal->cuda_device));
>>> +        if (ret < 0)
>>> +            goto error;
>>> +
>>> +        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
>>> +    }
>>> +
>>> +    hwctx->internal->is_allocated = 1;
>>> +
>>> +    // Setting stream to NULL will make functions automatically use the default CUstream
>>> +    hwctx->stream = NULL;
>>> +
>>> +    return 0;
>>> +
>>> +error:
>>> +    cuda_device_uninit(device_ctx);
>>> +    return AVERROR_UNKNOWN;
>>> +}
>>> +
>>>  const HWContextType ff_hwcontext_type_cuda = {
>>>  .type                 = AV_HWDEVICE_TYPE_CUDA,
>>>  .name                 = "CUDA",
>>> @@ -397,6 +517,7 @@ const HWContextType ff_hwcontext_type_cuda = {
>>>  .frames_priv_size     = sizeof(CUDAFramesContext),
>>>  
>>>  .device_create        = cuda_device_create,
>>> +    .device_derive        = cuda_device_derive,
>>>  .device_init          = cuda_device_init,
>>>  .device_uninit        = cuda_device_uninit,
>>>  .frames_get_constraints = cuda_frames_get_constraints,
>>> ...
>>> diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
>>> new file mode 100644
>>> index 0000000000..d4eb8ffd35
>>> --- /dev/null
>>> +++ b/libavutil/hwcontext_vulkan.c
>>> @@ -0,0 +1,2804 @@
>>> ...
>>> +
>>> +static const struct {
>>> +    enum AVPixelFormat pixfmt;
>>> +    const VkFormat vkfmts[3];
>>> +} vk_pixfmt_map[] = {
>>> +    { AV_PIX_FMT_GRAY8,   { VK_FORMAT_R8_UNORM } },
>>> +    { AV_PIX_FMT_GRAY16,  { VK_FORMAT_R16_UNORM } },
>>> +    { AV_PIX_FMT_GRAYF32, { VK_FORMAT_R32_SFLOAT } },
>>> +
>>> +    { AV_PIX_FMT_NV12, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8G8_UNORM } },
>>> +    { AV_PIX_FMT_P010, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16G16_UNORM } },
>>>
>>
>> Is P010 still safe when the low bits might have any value?
>>
> 
> Our padding is in the top bits. Vulkan defines it in the bottom bits, hence we can't use it.

?  Our padding is definitely in the bottom bits, since it's defined to be consistent with all the hardware using it.

> I can't see why it wouldn't be safe. Any code that deals with user-supplied frames must assume they might be junk.

Probably close enough for government work, but not quite right.  If the only operation you can do is load it into a float as UNORM, then a full-scale value with zeroed low bits comes out slightly off-white and the colourspace-conversion nazis will hunt you down.
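
For reference, a quick arithmetic sketch (mine, not anything from the patch) of how far off a zero-padded full-scale P010 sample lands when read back as R16_UNORM:

#include <stdio.h>

int main(void)
{
    unsigned p010_white = 0x3FF << 6;           /* 0xFFC0: 10-bit maximum in the high bits, zero padding */
    double as_unorm16   = p010_white / 65535.0; /* what sampling the plane as R16_UNORM produces */
    double intended     = 0x3FF / 1023.0;       /* what the 10-bit code actually means: exactly 1.0 */

    printf("%f vs %f\n", as_unorm16, intended); /* ~0.999039 vs 1.000000 */
    /* A shader has to scale by 65535.0 / 65472.0 (or shift and rescale) to
     * land exactly on full white. */
    return 0;
}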

>>> +    { AV_PIX_FMT_P016, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16G16_UNORM } },
>>> +
>>> +    { AV_PIX_FMT_YUV420P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM } },
>>> +    { AV_PIX_FMT_YUV422P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM } },
>>> +    { AV_PIX_FMT_YUV444P, { VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM, VK_FORMAT_R8_UNORM } },
>>> +
>>> +    { AV_PIX_FMT_YUV420P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } },
>>> +    { AV_PIX_FMT_YUV422P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } },
>>> +    { AV_PIX_FMT_YUV444P16, { VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UNORM } },
>>> +
>>> +    { AV_PIX_FMT_ABGR,   { VK_FORMAT_A8B8G8R8_UNORM_PACK32 } },
>>> +    { AV_PIX_FMT_BGRA,   { VK_FORMAT_B8G8R8A8_UNORM } },
>>> +    { AV_PIX_FMT_RGBA,   { VK_FORMAT_R8G8B8A8_UNORM } },
>>> +    { AV_PIX_FMT_RGB24,  { VK_FORMAT_R8G8B8_UNORM } },
>>> +    { AV_PIX_FMT_BGR24,  { VK_FORMAT_B8G8R8_UNORM } },
>>> +    { AV_PIX_FMT_RGB48,  { VK_FORMAT_R16G16B16_UNORM } },
>>> +    { AV_PIX_FMT_RGBA64, { VK_FORMAT_R16G16B16A16_UNORM } },
>>> +    { AV_PIX_FMT_RGB565, { VK_FORMAT_R5G6B5_UNORM_PACK16 } },
>>> +    { AV_PIX_FMT_BGR565, { VK_FORMAT_B5G6R5_UNORM_PACK16 } },
>>> +    { AV_PIX_FMT_BGR0,   { VK_FORMAT_B8G8R8A8_UNORM } },
>>> +    { AV_PIX_FMT_0BGR,   { VK_FORMAT_A8B8G8R8_UNORM_PACK32 } },
>>> +    { AV_PIX_FMT_RGB0,   { VK_FORMAT_R8G8B8A8_UNORM } },
>>> +
>>> +    { AV_PIX_FMT_GBRPF32, { VK_FORMAT_R32_SFLOAT, VK_FORMAT_R32_SFLOAT, VK_FORMAT_R32_SFLOAT } },
>>> +};
>>> +
>>> ...
>>> +static int vulkan_device_create(AVHWDeviceContext *ctx, const char *device,
>>> +                                AVDictionary *opts, int flags)
>>> +{
>>> +    VulkanDeviceSelection dev_select = { 0 };
>>> +    if (device && device[0]) {
>>> +        char *end = NULL;
>>> +        dev_select.index = strtol(device, &end, 10);
>>> +        if (end == device) {
>>> +            dev_select.index = 0;
>>> +            dev_select.name  = device;
>>> +        }
>>> +    }
>>>
>>
>> Is it worth making uuid=f00 work here as well?  (From opts rather than device: "-init_hw_device vulkan:,uuid=f00".)
>>
> 
> Would like to, but I don't think it's worth the extra dependency (libuuid; it's widespread on Linux, but I'd rather not NIH the parsing).

Ok.

>>> +
>>> +    return vulkan_device_create_internal(ctx, &dev_select, opts, flags);
>>> +}
>>> ...
>>> +
>>> +static int vulkan_frames_init(AVHWFramesContext *hwfc)
>>> +{
>>> +    int err;
>>> +    AVVkFrame *f;
>>> +    AVVulkanFramesContext *hwctx = hwfc->hwctx;
>>> +    VulkanFramesPriv *fp = hwfc->internal->priv;
>>> +    AVVulkanDeviceContext *dev_hwctx = hwfc->device_ctx->hwctx;
>>> +    VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
>>> +
>>> +    if (hwfc->pool)
>>> +        return 0;
>>> +
>>> +    /* Default pool flags */
>>> +    hwctx->tiling = hwctx->tiling ? hwctx->tiling : p->use_linear_images ?
>>> +                    VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL;
>>> +
>>> +    hwctx->usage |= DEFAULT_USAGE_FLAGS;
>>>
>>
>> Is it possible that this disallows some use-cases in a device where those default flags need not be supported?  For example, some sort of magic image-writer like a video decoder where the output images can only ever be used as a source by non-magic operations.  Baking that into the API (as in the comment on usage in the header) seems bad if so.
>>
> 
> You need to support those flags to do anything useful with the images.
> This restricts read-only images since those don't support the STORAGE flag. Such images are the planar images, which DO support the STORAGE flag but only for their subimages (e.g. individual planes). This is in spec, regardless of what Intel says and disagrees with (they advise aliasing memory instead). We don't use them anyway, and if a user wanted to repackage received frames as planar (or unpackage them) they still can.

I feel like devices with weird memory are still a problem here, but reading the code again I can see that the user-provided pool actually trumps the concern anyway - if anything funny is going on then the user can set it up however they want.
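
To make that concrete, here is roughly what the escape hatch looks like from the caller's side - a sketch of mine, not code from the patch, assuming the pixel format the patch adds is named AV_PIX_FMT_VULKAN and with error handling trimmed:

#include <libavutil/hwcontext.h>
#include <libavutil/hwcontext_vulkan.h>

static AVBufferRef *alloc_vulkan_frames(AVBufferRef *vk_device_ref)
{
    AVBufferRef           *frames_ref = av_hwframe_ctx_alloc(vk_device_ref);
    AVHWFramesContext     *frames     = (AVHWFramesContext *)frames_ref->data;
    AVVulkanFramesContext *vk_frames  = frames->hwctx;

    frames->format    = AV_PIX_FMT_VULKAN;
    frames->sw_format = AV_PIX_FMT_NV12;
    frames->width     = 1920;
    frames->height    = 1080;

    /* A non-zero tiling set here survives vulkan_frames_init(); usage still
     * gets DEFAULT_USAGE_FLAGS OR'd in, so a device with genuinely strange
     * images has to supply frames->pool itself and bypass the defaults
     * entirely. */
    vk_frames->tiling = VK_IMAGE_TILING_LINEAR;

    if (av_hwframe_ctx_init(frames_ref) < 0)
        av_buffer_unref(&frames_ref);
    return frames_ref;
}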

>>> ...
>>> +
>>> +static int vulkan_map_from_drm_frame_desc(AVHWFramesContext *hwfc, AVVkFrame **frame,
>>> +                                          AVDRMFrameDescriptor *desc)
>>> +{
>>> +    int err = 0;
>>> +    VkResult ret;
>>> +    AVVkFrame *f;
>>> +    AVHWDeviceContext *ctx = hwfc->device_ctx;
>>> +    AVVulkanDeviceContext *hwctx = ctx->hwctx;
>>> +    VulkanDevicePriv *p = ctx->internal->priv;
>>> +    const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
>>> +    const AVPixFmtDescriptor *fmt_desc = av_pix_fmt_desc_get(hwfc->sw_format);
>>> +    const int has_modifiers = p->extensions & EXT_DRM_MODIFIER_FLAGS;
>>> +    VkSubresourceLayout plane_data[AV_NUM_DATA_POINTERS];
>>> +    VkBindImageMemoryInfo bind_info[AV_NUM_DATA_POINTERS];
>>> +    VkExternalMemoryHandleTypeFlagBits htype = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT;
>>> +
>>> +    VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdPropertiesKHR);
>>> +
>>> +    for (int i = 0; i < desc->nb_layers; i++) {
>>> +        if (desc->layers[i].nb_planes > 1) {
>>> +            av_log(ctx, AV_LOG_ERROR, "Cannot import DMABUFS with more than 1 "
>>> +                                      "plane per layer!\n");
>>> +            return AVERROR(EINVAL);
>>> +        }
>>> +
>>> +        if (drm_to_vulkan_fmt(desc->layers[i].format) == VK_FORMAT_UNDEFINED) {
>>> +            av_log(ctx, AV_LOG_ERROR, "Unsupported DMABUF layer format!\n");
>>>
>>
>> Maybe say what the unsupported format is for someone reporting the message, since this is probably relatively easy to hit (e.g. YUYV).
>>
> 
> Sure, I'll just use the handy DRM API/define to translate DRM FOURCC uint32_t to a string.
> Oh wait, I can't because there is no such API. I've had to manually go through every single define in the past, relying on blind luck and speculation to match those up. DRM devs want people to suffer, as if the big-endian confusion-inducing FOURCC isn't evidence enough for it.
> Not willing to NIH something to pull the chars out of it yet. Maybe next time I have to look up a DRM format ID.

Print it with %#08x.  I just think it should appear in the log so that it's possible for someone reading the log to see the format which failed; if they have to decode it a little bit themselves that isn't a disaster.
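
Something along these lines would do - just a sketch, the helper name is made up:

#include <stdint.h>
#include "libavutil/log.h"

static void log_unsupported_drm_format(void *log_ctx, uint32_t fourcc)
{
    /* Print the raw DRM FourCC as hex plus its four characters, so the
     * failing format is recoverable from a bug report without needing any
     * DRM helper API. */
    av_log(log_ctx, AV_LOG_ERROR,
           "Unsupported DMABUF layer format %#08x ('%c%c%c%c')!\n",
           fourcc,
           (fourcc >>  0) & 0xff, (fourcc >>  8) & 0xff,
           (fourcc >> 16) & 0xff, (fourcc >> 24) & 0xff);
}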

>>> +            return AVERROR(EINVAL);
>>> +        }
>>> +    }
>>> +
>>> +    if (!(f = av_mallocz(sizeof(*f)))) {
>>> +        av_log(ctx, AV_LOG_ERROR, "Unable to allocate memory for AVVkFrame!\n");
>>> +        err = AVERROR(ENOMEM);
>>> +        goto fail;
>>> +    }
>>> +
>>> ...
>>> +
>>> +static int vulkan_map_to_drm(AVHWFramesContext *hwfc, AVFrame *dst,
>>> +                             const AVFrame *src, int flags)
>>> +{
>>> +    int err = 0;
>>> +    VkResult ret;
>>> +    AVVkFrame *f = (AVVkFrame *)src->data[0];
>>> +    VulkanDevicePriv *p = hwfc->device_ctx->internal->priv;
>>> +    AVVulkanDeviceContext *hwctx = hwfc->device_ctx->hwctx;
>>> +    const int planes = av_pix_fmt_count_planes(hwfc->sw_format);
>>> +    VK_LOAD_PFN(hwctx->inst, vkGetMemoryFdKHR);
>>> +    VkImageDrmFormatModifierPropertiesEXT drm_mod = {
>>> +        .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT,
>>> +    };
>>>
>>
>> Do you need a sync here for any writing being finished, or is it implicit somehow below?
>>
> 
> There isn't. This would be where we export a semaphore to VAAPI.
> We could make a CPU sync by converting the image to read optimal and waiting on the command queue's semaphore, but that would need another exec context.
> Should we?
> I don't have a way to test this ATM; my test program is gone, so if I do touch it I'll need to write a new one, which is a lot of work. With ffmpeg I can only test raw exporting without it actually being used by anything, since everything complains about some missing contexts IIRC.

You often get away with this because some part of the next operation ends up queued on the same hardware submission ring.

I guess leave it unless you want to add the extra sync, but add a comment for the next person reading it.
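
Something like the following in vulkan_map_to_drm() would be enough (the wording is only a suggestion):

/* NOTE: no synchronisation is performed on export.  The DMABUF may still
 * have Vulkan writes pending; in practice the consumer often ends up on
 * the same hardware submission ring so it works out, but a fully correct
 * fix would wait on (or export) the image's semaphore before handing the
 * FD over. */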

>>> +
>>> +    AVDRMFrameDescriptor *drm_desc = av_mallocz(sizeof(*drm_desc));
>>> +    if (!drm_desc)
>>> +        return AVERROR(ENOMEM);
>>> +
>>> +    err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, &vulkan_unmap_to_drm, drm_desc);
>>> +    if (err < 0)
>>> +        goto end;
>>> +
>>> +    if (p->extensions & EXT_DRM_MODIFIER_FLAGS) {
>>> +        VK_LOAD_PFN(hwctx->inst, vkGetImageDrmFormatModifierPropertiesEXT);
>>> +        ret = pfn_vkGetImageDrmFormatModifierPropertiesEXT(hwctx->act_dev, f->img[0],
>>> +                                                           &drm_mod);
>>> +        if (ret != VK_SUCCESS) {
>>> +            av_log(hwfc, AV_LOG_ERROR, "Failed to retrieve DRM format modifier!\n");
>>> +            err = AVERROR_EXTERNAL;
>>> +            goto end;
>>> +        }
>>> +    }
>>> +
>>> ...
>>> +
>>> +static int vulkan_transfer_data_to_mem(AVHWFramesContext *hwfc, AVFrame *dst,
>>> +                                       const AVFrame *src)
>>> +{
>>> +    int err = 0;
>>> +    AVFrame tmp;
>>> +    AVVkFrame *f = (AVVkFrame *)src->data[0];
>>> +    AVHWDeviceContext *dev_ctx = hwfc->device_ctx;
>>> +    ImageBuffer buf[AV_NUM_DATA_POINTERS] = { { 0 } };
>>> +    const int planes = av_pix_fmt_count_planes(dst->format);
>>> +    int log2_chroma = av_pix_fmt_desc_get(dst->format)->log2_chroma_h;
>>> +
>>> +    if (dst->width > hwfc->width || dst->height > hwfc->height)
>>> +        return AVERROR(EINVAL);
>>> +
>>> +    /* For linear, host visible images */
>>> +    if (f->tiling == VK_IMAGE_TILING_LINEAR &&
>>> +        f->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
>>>
>>
>> Is it generally expected that this is actually faster than the next option?  (Because evil uncached memory is a thing.)
>>
> 
> Could be. On Intel it was faster, but now it's as fast as tiled images. On NVIDIA it's slower. It's still faster on Intel for simple short filter chains. Keeping it for cheap devices.

Yep, fair enough.

>>> +        AVFrame *map = av_frame_alloc();
>>> +        if (!map)
>>> +            return AVERROR(ENOMEM);
>>> +        map->format = dst->format;
>>> +
>>> +        err = vulkan_map_frame_to_mem(hwfc, map, src, AV_HWFRAME_MAP_READ);
>>> +        if (err)
>>> +            return err;
>>> +
>>> +        err = av_frame_copy(dst, map);
>>> +        av_frame_free(&map);
>>> +        return err;
>>> +    }
>>> ...
Thanks,

- Mark

