[FFmpeg-devel] [PATCH 2/3] libavformat/hls: add support for SAMPLE-AES decryption in HLS demuxer

Fri Jan 22 20:05:47 EET 2021

Nachiket Tarate:
> Apple HTTP Live Streaming Sample Encryption:
> 
> https://developer.apple.com/library/ios/documentation/AudioVideo/Conceptual/HLS_Sample_Encryption
> 
> Signed-off-by: Nachiket Tarate <nachiket.programmer at gmail.com>
> ---
>  libavformat/Makefile         |   2 +-
>  libavformat/hls.c            |  97 ++++++-
>  libavformat/hls_sample_aes.c | 486 +++++++++++++++++++++++++++++++++++
>  libavformat/hls_sample_aes.h |  64 +++++
>  libavformat/mpegts.c         |  12 +
>  5 files changed, 647 insertions(+), 14 deletions(-)
>  create mode 100644 libavformat/hls_sample_aes.c
>  create mode 100644 libavformat/hls_sample_aes.h
> 
> diff --git a/libavformat/Makefile b/libavformat/Makefile
> index 3a8fbcbe5f..c97930d98b 100644
> --- a/libavformat/Makefile
> +++ b/libavformat/Makefile
> @@ -237,7 +237,7 @@ OBJS-$(CONFIG_HCOM_DEMUXER)              += hcom.o pcm.o
>  OBJS-$(CONFIG_HDS_MUXER)                 += hdsenc.o
>  OBJS-$(CONFIG_HEVC_DEMUXER)              += hevcdec.o rawdec.o
>  OBJS-$(CONFIG_HEVC_MUXER)                += rawenc.o
> -OBJS-$(CONFIG_HLS_DEMUXER)               += hls.o
> +OBJS-$(CONFIG_HLS_DEMUXER)               += hls.o hls_sample_aes.o
>  OBJS-$(CONFIG_HLS_MUXER)                 += hlsenc.o hlsplaylist.o avc.o
>  OBJS-$(CONFIG_HNM_DEMUXER)               += hnm.o
>  OBJS-$(CONFIG_ICO_DEMUXER)               += icodec.o
> diff --git a/libavformat/hls.c b/libavformat/hls.c
> index 619e4800de..9e7f020cea 100644
> --- a/libavformat/hls.c
> +++ b/libavformat/hls.c
> @@ -2,6 +2,7 @@
>   * Apple HTTP Live Streaming demuxer
>   * Copyright (c) 2010 Martin Storsjo
>   * Copyright (c) 2013 Anssi Hannula
> + * Copyright (c) 2021 Nachiket Tarate
>   *
>   * This file is part of FFmpeg.
>   *
> @@ -39,6 +40,8 @@
>  #include "avio_internal.h"
>  #include "id3v2.h"
>  
> +#include "hls_sample_aes.h"
> +
>  #define INITIAL_BUFFER_SIZE 32768
>  
>  #define MAX_FIELD_LEN 64
> @@ -145,6 +148,8 @@ struct playlist {
>      int id3_changed; /* ID3 tag data has changed at some point */
>      ID3v2ExtraMeta *id3_deferred_extra; /* stored here until subdemuxer is opened */
>  
> +    HLSAudioSetupInfo audio_setup_info;
> +
>      int64_t seek_timestamp;
>      int seek_flags;
>      int seek_stream_index; /* into subdemuxer stream array */
> @@ -986,7 +991,10 @@ fail:
>  
>  static struct segment *current_segment(struct playlist *pls)
>  {
> -    return pls->segments[pls->cur_seq_no - pls->start_seq_no];
> +    int n = pls->cur_seq_no - pls->start_seq_no;
> +    if (n >= pls->n_segments)
> +        return NULL;
> +    return pls->segments[n];
>  }
>  
>  static struct segment *next_segment(struct playlist *pls)
> @@ -1015,10 +1023,11 @@ static int read_from_url(struct playlist *pls, struct segment *seg,
>  
>  /* Parse the raw ID3 data and pass contents to caller */
>  static void parse_id3(AVFormatContext *s, AVIOContext *pb,
> -                      AVDictionary **metadata, int64_t *dts,
> +                      AVDictionary **metadata, int64_t *dts, HLSAudioSetupInfo *audio_setup_info,
>                        ID3v2ExtraMetaAPIC **apic, ID3v2ExtraMeta **extra_meta)
>  {
>      static const char id3_priv_owner_ts[] = "com.apple.streaming.transportStreamTimestamp";
> +    static const char id3_priv_owner_audio_setup[] = "com.apple.streaming.audioDescription";
>      ID3v2ExtraMeta *meta;
>  
>      ff_id3v2_read_dict(pb, metadata, ID3v2_DEFAULT_MAGIC, extra_meta);
> @@ -1034,6 +1043,9 @@ static void parse_id3(AVFormatContext *s, AVIOContext *pb,
>                  else
>                      av_log(s, AV_LOG_ERROR, "Invalid HLS ID3 audio timestamp %"PRId64"\n", ts);
>              }
> +            else if (priv->datasize >= 8 && !strcmp(priv->owner, id3_priv_owner_audio_setup)) {
> +                ff_hls_read_audio_setup_info(audio_setup_info, priv->data, priv->datasize);
> +            }
>          } else if (!strcmp(meta->tag, "APIC") && apic)
>              *apic = &meta->data.apic;
>      }
> @@ -1076,7 +1088,7 @@ static void handle_id3(AVIOContext *pb, struct playlist *pls)
>      ID3v2ExtraMeta *extra_meta = NULL;
>      int64_t timestamp = AV_NOPTS_VALUE;
>  
> -    parse_id3(pls->ctx, pb, &metadata, &timestamp, &apic, &extra_meta);
> +    parse_id3(pls->ctx, pb, &metadata, &timestamp, &pls->audio_setup_info, &apic, &extra_meta);
>  
>      if (timestamp != AV_NOPTS_VALUE) {
>          pls->id3_mpegts_timestamp = timestamp;
> @@ -1230,10 +1242,7 @@ static int open_input(HLSContext *c, struct playlist *pls, struct segment *seg,
>      av_log(pls->parent, AV_LOG_VERBOSE, "HLS request for url '%s', offset %"PRId64", playlist %d\n",
>             seg->url, seg->url_offset, pls->index);
>  
> -    if (seg->key_type == KEY_NONE) {
> -        ret = open_url(pls->parent, in, seg->url, &c->avio_opts, opts, &is_http);
> -    } else if (seg->key_type == KEY_AES_128) {
> -        char iv[33], key[33], url[MAX_URL_SIZE];
> +    if (seg->key_type == KEY_AES_128 || seg->key_type == KEY_SAMPLE_AES) {
>          if (strcmp(seg->key, pls->key_url)) {
>              AVIOContext *pb = NULL;
>              if (open_url(pls->parent, &pb, seg->key, &c->avio_opts, opts, NULL) == 0) {
> @@ -1249,6 +1258,10 @@ static int open_input(HLSContext *c, struct playlist *pls, struct segment *seg,
>              }
>              av_strlcpy(pls->key_url, seg->key, sizeof(pls->key_url));
>          }
> +    }
> +
> +    if (seg->key_type == KEY_AES_128) {
> +        char iv[33], key[33], url[MAX_URL_SIZE];
>          ff_data_to_hex(iv, seg->iv, sizeof(seg->iv), 0);
>          ff_data_to_hex(key, pls->key, sizeof(pls->key), 0);
>          iv[32] = key[32] = '\0';
> @@ -1265,13 +1278,9 @@ static int open_input(HLSContext *c, struct playlist *pls, struct segment *seg,
>              goto cleanup;
>          }
>          ret = 0;
> -    } else if (seg->key_type == KEY_SAMPLE_AES) {
> -        av_log(pls->parent, AV_LOG_ERROR,
> -               "SAMPLE-AES encryption is not supported yet\n");
> -        ret = AVERROR_PATCHWELCOME;
> +    } else {
> +        ret = open_url(pls->parent, in, seg->url, &c->avio_opts, opts, &is_http);
>      }
> -    else
> -      ret = AVERROR(ENOSYS);
>  
>      /* Seek to the requested position. If this was a HTTP request, the offset
>       * should already be where want it to, but this allows e.g. local testing
> @@ -1940,6 +1949,7 @@ static int hls_read_header(AVFormatContext *s)
>          struct playlist *pls = c->playlists[i];
>          char *url;
>          ff_const59 AVInputFormat *in_fmt = NULL;
> +        struct segment *seg = NULL;
>  
>          if (!(pls->ctx = avformat_alloc_context())) {
>              ret = AVERROR(ENOMEM);
> @@ -1972,8 +1982,52 @@ static int hls_read_header(AVFormatContext *s)
>              pls->ctx = NULL;
>              goto fail;
>          }
> +
>          ffio_init_context(&pls->pb, pls->read_buffer, INITIAL_BUFFER_SIZE, 0, pls,
>                            read_data, NULL, NULL);
> +
> +        /*
> +         * If encryption scheme is SAMPLE-AES, try to read  ID3 tags of
> +         * external audio track that contains audio setup information
> +         */
> +        seg = current_segment(pls);
> +        if (seg && seg->key_type == KEY_SAMPLE_AES && pls->n_renditions > 0 &&
> +            pls->renditions[0]->type == AVMEDIA_TYPE_AUDIO) {
> +
> +            uint8_t *buf = av_malloc(HLS_MAX_ID3_TAGS_DATA_LEN);

Such small buffers are best put on the stack: It saves checks and
cleanup code.

> +            if (!buf) {
> +                ret = AVERROR(ENOMEM);
> +                avformat_free_context(pls->ctx);
> +                pls->ctx = NULL;
> +                goto fail;
> +            }
> +
> +            if ((ret = avio_read(&pls->pb, buf, HLS_MAX_ID3_TAGS_DATA_LEN)) < 0) {
> +                /* Fail if error was not end of file */
> +                if (ret != AVERROR_EOF) {
> +                    av_free(buf);
> +                    avformat_free_context(pls->ctx);
> +                    pls->ctx = NULL;
> +                    goto fail;
> +                }
> +                ret   = 0;          /* error was end of file, nothing read */
> +            }
> +
> +            av_free(buf);
> +        }
> +
> +        /*
> +         * If encryption scheme is SAMPLE-AES and audio setup information is present in external audio track,
> +         * use that information to find the media format, otherwise probe input data
> +         */
> +        if (seg->key_type == KEY_SAMPLE_AES && pls->is_id3_timestamped == 1 &&
> +            pls->audio_setup_info.codec_id != AV_CODEC_ID_NONE) {
> +            void *i = 0;

Initialize the pointer with NULL rather than 0, i.e. void *i = NULL;

> +            while ((in_fmt = (ff_const59 AVInputFormat *)av_demuxer_iterate(&i)))
> +                if (in_fmt->raw_codec_id == pls->audio_setup_info.codec_id) {
> +                    break;
> +                }
> +        } else {
>          pls->ctx->probesize = s->probesize > 0 ? s->probesize : 1024 * 4;
>          pls->ctx->max_analyze_duration = s->max_analyze_duration > 0 ? s->max_analyze_duration : 4 * AV_TIME_BASE;
>          pls->ctx->interrupt_callback = s->interrupt_callback;
> @@ -1991,6 +2045,8 @@ static int hls_read_header(AVFormatContext *s)
>              goto fail;
>          }
>          av_free(url);
> +        }
> +
>          pls->ctx->pb       = &pls->pb;
>          pls->ctx->io_open  = nested_io_open;
>          pls->ctx->flags   |= s->flags & ~AVFMT_FLAG_CUSTOM_IO;
> @@ -2019,7 +2075,12 @@ static int hls_read_header(AVFormatContext *s)
>           * on us if they want to.
>           */
>          if (pls->is_id3_timestamped || (pls->n_renditions > 0 && pls->renditions[0]->type == AVMEDIA_TYPE_AUDIO)) {
> +            if (seg && seg->key_type == KEY_SAMPLE_AES && pls->audio_setup_info.setup_data_length > 0 &&
> +                pls->ctx->nb_streams == 1) {
> +                ret = ff_hls_parse_audio_setup_info(pls->ctx->streams[0], &pls->audio_setup_info);
> +            } else {
>              ret = avformat_find_stream_info(pls->ctx, NULL);
> +            }
>              if (ret < 0)
>                  goto fail;
>          }
> @@ -2149,6 +2210,7 @@ static int hls_read_packet(AVFormatContext *s, AVPacket *pkt)
>              while (1) {
>                  int64_t ts_diff;
>                  AVRational tb;
> +                struct segment *seg = NULL;
>                  ret = av_read_frame(pls->ctx, &pls->pkt);
>                  if (ret < 0) {
>                      if (!avio_feof(&pls->pb) && ret != AVERROR_EOF)
> @@ -2167,6 +2229,15 @@ static int hls_read_packet(AVFormatContext *s, AVPacket *pkt)
>                              get_timebase(pls), AV_TIME_BASE_Q);
>                  }
>  
> +                seg = current_segment(pls);
> +                if (seg && seg->key_type == KEY_SAMPLE_AES) {
> +                    HLSCryptoContext crypto_ctx;
> +                    enum AVCodecID codec_id = pls->ctx->streams[pls->pkt.stream_index]->codecpar->codec_id;
> +                    memcpy(crypto_ctx.iv, seg->iv, sizeof(seg->iv));
> +                    memcpy(crypto_ctx.key, pls->key, sizeof(pls->key));
> +                    ff_hls_decrypt_frame(codec_id, &crypto_ctx, &pls->pkt);
> +                }
> +
>                  if (pls->seek_timestamp == AV_NOPTS_VALUE)
>                      break;
>  
> diff --git a/libavformat/hls_sample_aes.c b/libavformat/hls_sample_aes.c
> new file mode 100644
> index 0000000000..0fb20b8613
> --- /dev/null
> +++ b/libavformat/hls_sample_aes.c
> @@ -0,0 +1,486 @@
> +/*
> + * Apple HTTP Live Streaming Sample Encryption/Decryption
> + *
> + * Copyright (c) 2021 Nachiket Tarate
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * Apple HTTP Live Streaming Sample Encryption
> + * https://developer.apple.com/library/ios/documentation/AudioVideo/Conceptual/HLS_Sample_Encryption
> + */
> +
> +#include "hls_sample_aes.h"
> +
> +#include "libavcodec/adts_header.h"
> +#include "libavcodec/adts_parser.h"
> +#include "libavcodec/ac3_parser_internal.h"
> +#include "libavutil/aes.h"
> +
> +
> +typedef struct NALUnit {
> +    uint8_t     *data;
> +    int         type;
> +    int         length;
> +} NALUnit;
> +
> +typedef struct AudioFrame {
> +    uint8_t     *data;
> +    int         length;
> +    int         header_length;
> +} AudioFrame;
> +
> +typedef struct CodecParserContext {
> +    const uint8_t   *buf_in;
> +    const uint8_t   *buf_end;
> +    uint8_t         *buf_out;
> +    int             next_start_code_length;
> +} CodecParserContext;
> +
> +static const int eac3_sample_rate_tab[] = { 48000, 44100, 32000, 0 };
> +
> +void ff_hls_read_audio_setup_info(HLSAudioSetupInfo *info, const uint8_t *buf, size_t size)
> +{
> +    info->codec_tag 		 = AV_RL32(buf);
> +
> +    if (!strncmp((const char*)&info->codec_tag, "zaac", 4))

These checks are endian-dependent. Use if (info->codec_tag == MKTAG('z',
'a', 'a', 'c')) etc. (Given that this format is actually big-endian, I
wonder whether codec_tag should rather be read via AV_RB32(buf); in that
case, you need to use MKBETAG instead of MKTAG.)

> +        info->codec_id = AV_CODEC_ID_AAC;
> +    else if (!strncmp((const char*)&info->codec_tag, "zac3", 4))
> +        info->codec_id = AV_CODEC_ID_AC3;
> +    else if (!strncmp((const char*)&info->codec_tag, "zec3", 4))
> +        info->codec_id = AV_CODEC_ID_EAC3;
> +    else
> +        info->codec_id = AV_CODEC_ID_NONE;
> +
> +    buf += 4;
> +    info->priming               = AV_RL16(buf);
> +    buf += 2;
> +    info->version               = *buf++;
> +    info->setup_data_length     = *buf++;
> +
> +    memcpy(info->setup_data, buf, info->setup_data_length);

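You have a size parameter, but you don't use it at all. You just trust
that the data that you are copying here does not extend beyond the end
of the buffer. This is wrong. The same applies to the destination:
setup_data_length is an uint8_t read from the input, so it can be as
large as 255, whereas info->setup_data only has room for
HLS_MAX_AUDIO_SETUP_DATA_LEN (10) bytes.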

> +}
> +
> +int ff_hls_parse_audio_setup_info(AVStream *st, HLSAudioSetupInfo *info)
> +{
> +    int ret = 0;
> +
> +    st->codecpar->codec_tag = info->codec_tag;
> +
> +    if (st->codecpar->codec_id == AV_CODEC_ID_AAC)
> +        return 0;
> +
> +    if (st->codecpar->codec_id != AV_CODEC_ID_AC3 && st->codecpar->codec_id != AV_CODEC_ID_EAC3)
> +        return AVERROR_INVALIDDATA;
> +    
> +    st->codecpar->extradata = av_mallocz(info->setup_data_length + AV_INPUT_BUFFER_PADDING_SIZE);
> +
> +    if (!st->codecpar->extradata)
> +        return AVERROR(ENOMEM);
> +
> +    st->codecpar->extradata_size = info->setup_data_length;
> +

Did you forget to copy info->setup_data to st->codecpar->extradata?

> +
> +    if (st->codecpar->codec_id == AV_CODEC_ID_AC3) {
> +
> +        AC3HeaderInfo *ac3hdr = NULL;
> +
> +        ret = avpriv_ac3_parse_header(&ac3hdr, info->setup_data, info->setup_data_length);
> +        if (ret < 0) {
> +            if (ret != AVERROR(ENOMEM)) {
> +                av_free(ac3hdr);
> +            }
> +            return ret;
> +        }
> +
> +        st->codecpar->sample_rate       = ac3hdr->sample_rate;
> +        st->codecpar->channels          = ac3hdr->channels;
> +        st->codecpar->channel_layout    = ac3hdr->channel_layout;
> +        st->codecpar->bit_rate          = ac3hdr->bit_rate;
> +
> +        av_free(ac3hdr);
> +    }
> +    else {  /*  Parse 'dec3' EC3SpecificBox */
> +
> +        GetBitContext gb;
> +        int data_rate, fscod, acmod, lfeon;
> +
> +        ret = init_get_bits8(&gb, info->setup_data, info->setup_data_length);
> +        if (ret < 0)
> +            return AVERROR_INVALIDDATA;
> +
> +        data_rate = get_bits(&gb, 13);
> +        skip_bits(&gb, 3);
> +        fscod = get_bits(&gb, 2);
> +        skip_bits(&gb, 10);
> +        acmod = get_bits(&gb, 3);
> +        lfeon = get_bits(&gb, 1);
> +
> +        st->codecpar->sample_rate = eac3_sample_rate_tab[fscod];
> +
> +        st->codecpar->channel_layout = avpriv_ac3_channel_layout_tab[acmod];
> +        if (lfeon)
> +            st->codecpar->channel_layout |= AV_CH_LOW_FREQUENCY;
> +
> +        st->codecpar->channels = av_get_channel_layout_nb_channels(st->codecpar->channel_layout);
> +
> +        st->codecpar->bit_rate = data_rate*1000;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Remove start code emulation prevention 0x03 bytes
> + */
> +static void remove_scep_3_bytes (NALUnit *nalu)
> +{
> +    int i = 0;
> +    int j = 0;
> +
> +    uint8_t *data = nalu->data;
> +
> +    while (i < nalu->length) {
> +        if (nalu->length - i > 3 && data[i] == 0x00 && data[i+1] == 0x00 && data[i+2] == 0x03 &&
> +            (data[i+3] == 0x00 || data[i+3] == 0x01 || data[i+3] == 0x02 || data[i+3] == 0x03)) {

If data + i + 3 is part of the NALU, then data[i + 3] <= 3 is
automatically fulfilled for spec-compliant content, so that the check in
parentheses above is unnecessary; notice that the normative procedure to
remove emulation_prevention_three_bytes does not check this (it just
removes the 0x03), so this is one more reason to not check this. (Btw:
The normative procedure allows the 0x03 to be the last byte of the NALU,
but if I am not mistaken, this case can't happen here for valid content,
because the unencrypted NALU can't end with 0x00.)

> +            data[j] = 0x00;
> +            data[j+1] = 0x00;
> +            data[j+2] = data[i+3];
> +            i += 4;
> +            j += 3;

This is wrong: You must not copy and consume data[i + 3] at this point,
because that byte may itself be the start of the next escape sequence. If
something like 0xFF 00 00 03 00 00 03 (0xFF exists only to reset the
internal unescaping state) is to be unescaped, the end result with your
algorithm is 0xFF 00 00 00 00 03, yet it should be 0xFF 00 00 00 00. See
the unescaping algorithm in 7.3.1 of the H.264 specification.

> +        } else {
> +            data[j++] = data[i++];
> +        }
> +    }
> +
> +    nalu->length = j;
> +}
> +
> +static int is_start_code (const uint8_t *buf, int zeros_in_start_code)
> +{
> +  int i;
> +
> +  for (i = 0; i < zeros_in_start_code; i++) {
> +    if(*(buf++) != 0x00) {
> +      return 0;
> +    }
> +  }
> +
> +  if(*buf != 0x01)
> +    return 0;
> +
> +  return 1;
> +}
> +
> +static int get_next_nal_unit (CodecParserContext *ctx, NALUnit *nalu)
> +{
> +    int i;
> +      int len = 0;
> +    int nalu_start_offset = 0;
> +
> +    uint8_t *buf_out = ctx->buf_out;
> +
> +    if (ctx->next_start_code_length != 0) {
> +        for (i = 0; i < ctx->next_start_code_length - 1; i++) {
> +          *buf_out++ = 0;
> +          len++;
> +        }
> +        *buf_out++ = 1;
> +        len++;
> +        ctx->next_start_code_length = 0;
> +      } else {

Wrong indentation.

> +        while (ctx->buf_in < ctx->buf_end) {
> +          len++;
> +          if ((*buf_out++ = *ctx->buf_in++) != 0)
> +              break;
> +        }
> +    }
> +
> +    if (ctx->buf_in >= ctx->buf_end) {
> +        if (len == 0)
> +              return 0;
> +        else
> +              return -1;
> +    }
> +
> +    /* No start code at the beginning of the NAL unit */
> +    if(*(ctx->buf_in - 1) != 1 || len < 3) {
> +        return -1;
> +    }
> +
> +    nalu_start_offset = len;
> +
> +    while (ctx->next_start_code_length == 0) {
> +        if (ctx->buf_in >= ctx->buf_end) {
> +            nalu->data   = ctx->buf_out + nalu_start_offset;
> +            nalu->length = len - nalu_start_offset;
> +            nalu->type   = *nalu->data & 0x1F;
> +            ctx->buf_out += nalu_start_offset;
> +            return 0;
> +        }
> +        *buf_out++ = *ctx->buf_in++;

Your current approach is to copy every NAL unit one byte at a time. This
seems quite suboptimal to me. How about first finding the limits of the
NAL unit and, if the NAL unit needs to be moved, copying it to its
destination with memmove? I wouldn't be surprised if the NAL unit
already ends up at its correct destination most of the time (given
that emulation prevention bytes are rare; moreover, all those pictures
that only use one slice won't be moved at all).

This will also allow you to use one of our standard functions to find
start codes. The way you are doing it here is probably quite slow.

> +        len++;
> +        if (is_start_code(ctx->buf_in - 4, 3))
> +            ctx->next_start_code_length = 4;
> +        else if (is_start_code(ctx->buf_in - 3, 2))
> +            ctx->next_start_code_length = 3;
> +        else
> +            ctx->next_start_code_length = 0;
> +    }
> +
> +    len -= ctx->next_start_code_length;
> +
> +    nalu->data	 = ctx->buf_out + nalu_start_offset;
> +    nalu->length = len - nalu_start_offset;
> +    nalu->type	 = *nalu->data & 0x1F;
> +    ctx->buf_out += nalu_start_offset;
> +    return 0;
> +}
> +
> +static int decrypt_nal_unit (HLSCryptoContext *crypto_ctx, NALUnit *nalu)
> +{
> +    int ret = 0;
> +    int rem_bytes;
> +    uint8_t *data;
> +    uint8_t	iv[16];
> +    uint8_t	decrypted_block[16];
> +
> +    struct AVAES *aes_ctx = av_aes_alloc();

Allocating a new AVAES context for every encrypted NAL unit seems to be
a complete waste. Why don't you keep it in the context and reinitialize
it before every use?

> +    if (!aes_ctx) {
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    ret = av_aes_init(aes_ctx, crypto_ctx->key, 16 * 8, 1);
> +    if (ret < 0) {
> +        av_free(aes_ctx);
> +        return ret;
> +    }
> +
> +    /* Remove start code emulation prevention 0x03 bytes */
> +    remove_scep_3_bytes(nalu);
> +
> +    data = nalu->data + 32;
> +    rem_bytes = nalu->length - 32;
> +
> +    memcpy(iv, crypto_ctx->iv, 16);
> +
> +    while (rem_bytes > 0) {
> +        if (rem_bytes > 16) {
> +            av_aes_crypt(aes_ctx, decrypted_block, data, 1, iv, 1);
> +            memcpy(iv, data, 16);
> +            memcpy(data, decrypted_block, 16);

av_aes_crypt already updates the initialization vector and it allows src
and dst to coincide. So I think you can just use av_aes_crypt(aes_ctx,
data, data, 1, iv, 1); above and remove decrypted_block altogether.

> +            data += 16;
> +            rem_bytes -= 16;
> +        }
> +        data += 144;
> +        rem_bytes -= 144;

According to
https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/HLS_Sample_Encryption/Encryption/Encryption.html#//apple_ref/doc/uid/TP40012862-CH2-SW8
this should be FFMIN(144, rem_bytes).

> +    }
> +
> +    av_free(aes_ctx);
> +
> +    return 0;
> +}
> +
> +static int decrypt_video_frame (HLSCryptoContext *crypto_ctx, AVPacket *pkt)
> +{
> +    int ret = 0;
> +    CodecParserContext  ctx;
> +    NALUnit nalu;
> +
> +    memset(&ctx, 0, sizeof(ctx));
> +    ctx.buf_in  = pkt->data;
> +    ctx.buf_out = pkt->data;
> +    ctx.buf_end = pkt->data + pkt->size;
> +
> +    while (ctx.buf_in < ctx.buf_end) {
> +        memset(&nalu, 0, sizeof(nalu));
> +        ret = get_next_nal_unit(&ctx, &nalu);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        if ((nalu.type == 0x01 || nalu.type == 0x05) && nalu.length > 48) {
> +            ret = decrypt_nal_unit(crypto_ctx, &nalu);
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        }
> +        ctx.buf_out  += nalu.length;
> +    }
> +
> +    av_shrink_packet(pkt, ctx.buf_out - pkt->data);
> +
> +    return 0;
> +}
> +
> +static int get_next_adts_frame (CodecParserContext *ctx, AudioFrame *frame)
> +{
> +    int ret = 0;
> +
> +    AACADTSHeaderInfo *adts_hdr = NULL;
> +
> +    /* Find next sync word 0xFFF */
> +    while (ctx->buf_in < ctx->buf_end - 1) {
> +        if (*ctx->buf_in == 0xFF && *(ctx->buf_in + 1) & 0xF0 == 0xF0)
> +            break;
> +        ctx->buf_in++;
> +    }
> +
> +    if (ctx->buf_in >= ctx->buf_end - 1) {
> +        return -1;
> +    }
> +
> +    frame->data = (uint8_t*)ctx->buf_in;
> +
> +    ret = avpriv_adts_header_parse (&adts_hdr, frame->data, ctx->buf_end - frame->data);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    frame->header_length = adts_hdr->crc_absent ? AV_AAC_ADTS_HEADER_SIZE : AV_AAC_ADTS_HEADER_SIZE + 2;
> +    frame->length = adts_hdr->frame_length;
> +
> +    av_free(adts_hdr);
> +
> +    return 0;
> +}
> +
> +static int get_next_ac3_eac3_sync_frame (CodecParserContext *ctx, AudioFrame *frame)
> +{
> +    int ret = 0;
> +
> +    AC3HeaderInfo *hdr = NULL;
> +
> +    /* Find next sync word 0x0B77 */
> +    while (ctx->buf_in < ctx->buf_end - 1) {
> +        if (*ctx->buf_in == 0x0B && *(ctx->buf_in + 1) == 0x77)
> +            break;
> +        ctx->buf_in++;
> +    }
> +
> +    if (ctx->buf_in >= ctx->buf_end - 1) {
> +        return -1;
> +    }
> +
> +    frame->data = (uint8_t*)ctx->buf_in;
> +    frame->header_length = 0;
> +
> +    ret = avpriv_ac3_parse_header(&hdr, frame->data, ctx->buf_end - frame->data);
> +    if (ret < 0) {
> +        if (ret != AVERROR(ENOMEM)) {
> +            av_free(hdr);
> +        }
> +        return ret;
> +    }
> +
> +    frame->length = hdr->frame_size;
> +
> +    av_free(hdr);
> +
> +    return 0;
> +}
> +
> +static int get_next_sync_frame (enum AVCodecID codec_id, CodecParserContext *ctx, AudioFrame *frame)
> +{
> +    if (codec_id == AV_CODEC_ID_AAC)
> +        return get_next_adts_frame(ctx, frame);
> +    else if (codec_id == AV_CODEC_ID_AC3 || codec_id == AV_CODEC_ID_EAC3)
> +        return get_next_ac3_eac3_sync_frame(ctx, frame);
> +    else
> +        return AVERROR_INVALIDDATA;
> +}
> +
> +
> +static int decrypt_sync_frame (enum AVCodecID codec_id, HLSCryptoContext *crypto_ctx, AudioFrame *frame)
> +{
> +    int ret = 0;
> +    uint8_t *data;
> +    uint8_t	*decrypted_data;
> +    int num_of_encrypted_blocks;
> +
> +    struct AVAES *aes_ctx = av_aes_alloc();
> +    if (!aes_ctx) {
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    ret = av_aes_init(aes_ctx, crypto_ctx->key, 16 * 8, 1);
> +    if (ret < 0) {
> +        av_free(aes_ctx);
> +        return ret;
> +    }
> +
> +    data = frame->data + frame->header_length + 16;
> +
> +    num_of_encrypted_blocks = (frame->length - frame->header_length - 16)/16;
> +
> +    decrypted_data = av_mallocz(num_of_encrypted_blocks*16);
> +    if (!decrypted_data) {
> +        return AVERROR(ENOMEM);
> +    }
> +
> +    av_aes_crypt(aes_ctx, decrypted_data, data, num_of_encrypted_blocks, crypto_ctx->iv, 1);
> +
> +    if (codec_id == AV_CODEC_ID_EAC3)
> +        memcpy(crypto_ctx->iv, data + (num_of_encrypted_blocks - 1)*16, 16);
> +
> +    memcpy(data, decrypted_data, num_of_encrypted_blocks*16);
> +
> +    av_free(decrypted_data);
> +    av_free(aes_ctx);
> +
> +    return 0;
> +}
> +
> +static int decrypt_audio_frame (enum AVCodecID codec_id, HLSCryptoContext *crypto_ctx, AVPacket *pkt)
> +{
> +    int ret = 0;
> +    CodecParserContext  ctx;
> +    AudioFrame frame;
> +
> +    memset(&ctx, 0, sizeof(ctx));
> +    ctx.buf_in 	= pkt->data;
> +    ctx.buf_end = pkt->data + pkt->size;
> +
> +    while (ctx.buf_in < ctx.buf_end) {
> +        memset(&frame, 0, sizeof(frame));
> +        ret = get_next_sync_frame(codec_id, &ctx, &frame);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        if (frame.length - frame.header_length > 31) {
> +            ret = decrypt_sync_frame(codec_id, crypto_ctx, &frame);
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        }
> +        ctx.buf_in += frame.length;
> +    }
> +
> +    return 0;
> +}
> +
> +
> +int ff_hls_decrypt_frame (enum AVCodecID codec_id, HLSCryptoContext *crypto_ctx, AVPacket *pkt)
> +{
> +    if (codec_id == AV_CODEC_ID_H264)
> +        return decrypt_video_frame(crypto_ctx, pkt);
> +    else if (codec_id == AV_CODEC_ID_AAC || codec_id == AV_CODEC_ID_AC3 || codec_id == AV_CODEC_ID_EAC3)
> +        return decrypt_audio_frame(codec_id, crypto_ctx, pkt);
> +
> +    return AVERROR_INVALIDDATA;
> +}
> diff --git a/libavformat/hls_sample_aes.h b/libavformat/hls_sample_aes.h
> new file mode 100644
> index 0000000000..aa0c8dd2a8
> --- /dev/null
> +++ b/libavformat/hls_sample_aes.h
> @@ -0,0 +1,64 @@
> +/*
> + * Apple HTTP Live Streaming Sample Encryption/Decryption
> + *
> + * Copyright (c) 2021 Nachiket Tarate
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * Apple HTTP Live Streaming Sample Encryption
> + * https://developer.apple.com/library/ios/documentation/AudioVideo/Conceptual/HLS_Sample_Encryption
> + */
> +
> +#ifndef AVFORMAT_HLS_SAMPLE_AES_H
> +#define AVFORMAT_HLS_SAMPLE_AES_H
> +
> +#include <stdint.h>
> +
> +#include "avformat.h"
> +
> +#include "libavcodec/avcodec.h"
> +
> +#define HLS_MAX_ID3_TAGS_DATA_LEN	    138
> +#define HLS_MAX_AUDIO_SETUP_DATA_LEN	10
> +
> +
> +typedef struct HLSCryptoContext {
> +    uint8_t 		key[16];
> +    uint8_t 		iv[16];
> +} HLSCryptoContext;
> +
> +typedef struct HLSAudioSetupInfo {
> +    enum AVCodecID      codec_id;
> +    uint32_t            codec_tag;
> +    uint16_t            priming;
> +    uint8_t             version;
> +    uint8_t             setup_data_length;
> +    uint8_t             setup_data[HLS_MAX_AUDIO_SETUP_DATA_LEN];
> +} HLSAudioSetupInfo;
> +
> +
> +void ff_hls_read_audio_setup_info(HLSAudioSetupInfo *info, const uint8_t *buf, size_t size);
> +
> +int ff_hls_parse_audio_setup_info(AVStream *st, HLSAudioSetupInfo *info);
> +
> +int ff_hls_decrypt_frame (enum AVCodecID codec_id, HLSCryptoContext *crypto_ctx, AVPacket *pkt);
> +
> +#endif /* AVFORMAT_HLS_SAMPLE_AES_H */
> +
> diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
> index e283ec09d7..dc611ae788 100644
> --- a/libavformat/mpegts.c
> +++ b/libavformat/mpegts.c
> @@ -839,6 +839,16 @@ static const StreamType MISC_types[] = {
>      { 0 },
>  };
>  
> +/* HLS Sample Encryption Types  */
> +static const StreamType HLS_SAMPLE_ENC_types[] = {
> +    { 0xdb, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264},
> +    { 0xcf, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
> +    { 0xc1, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AC3 },
> +    { 0xc2, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_EAC3},
> +    { 0 },
> +};
> +
> +
>  static const StreamType REGD_types[] = {
>      { MKTAG('d', 'r', 'a', 'c'), AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_DIRAC },
>      { MKTAG('A', 'C', '-', '3'), AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AC3   },
> @@ -948,6 +958,8 @@ static int mpegts_set_stream_info(AVStream *st, PESContext *pes,
>      }
>      if (st->codecpar->codec_id == AV_CODEC_ID_NONE)
>          mpegts_find_stream_type(st, pes->stream_type, MISC_types);
> +    if (st->codecpar->codec_id == AV_CODEC_ID_NONE)
> +        mpegts_find_stream_type(st, pes->stream_type, HLS_SAMPLE_ENC_types);
>      if (st->codecpar->codec_id == AV_CODEC_ID_NONE) {
>          st->codecpar->codec_id  = old_codec_id;
>          st->codecpar->codec_type = old_codec_type;
>