Source: lib/cea/mp4_cea_parser.js

  1. /*! @license
  2. * Shaka Player
  3. * Copyright 2016 Google LLC
  4. * SPDX-License-Identifier: Apache-2.0
  5. */
  6. goog.provide('shaka.cea.Mp4CeaParser');
  7. goog.require('goog.asserts');
  8. goog.require('shaka.cea.CeaUtils');
  9. goog.require('shaka.cea.SeiProcessor');
  10. goog.require('shaka.log');
  11. goog.require('shaka.media.ClosedCaptionParser');
  12. goog.require('shaka.util.DataViewReader');
  13. goog.require('shaka.util.Error');
  14. goog.require('shaka.util.Mp4Parser');
  15. goog.require('shaka.util.Mp4BoxParsers');
  /**
   * MPEG4 stream parser used for extracting 708 closed captions data.
   *
   * Call init() with the init segment first so that the track timescales, the
   * TREX defaults and the video bitstream format are known, then call parse()
   * on each media segment to collect the caption packets carried in SEI NAL
   * units.
   *
   * @implements {shaka.extern.ICeaParser}
   * @export
   */
  shaka.cea.Mp4CeaParser = class {
    constructor() {
      /**
       * SEI data processor.
       * @private
       * @const {!shaka.cea.SeiProcessor}
       */
      this.seiProcessor_ = new shaka.cea.SeiProcessor();

      /**
       * Map of track id to corresponding timescale.
       * @private {!Map<number, number>}
       */
      this.trackIdToTimescale_ = new Map();

      /**
       * Default sample duration, as specified by the TREX box.
       * @private {number}
       */
      this.defaultSampleDuration_ = 0;

      /**
       * Default sample size, as specified by the TREX box.
       * @private {number}
       */
      this.defaultSampleSize_ = 0;

      /**
       * Bitstream format detected from the init segment. While it is UNKNOWN,
       * parse() returns no caption packets.
       * @private {shaka.cea.Mp4CeaParser.BitstreamFormat}
       */
      this.bitstreamFormat_ = shaka.cea.Mp4CeaParser.BitstreamFormat.UNKNOWN;
    }

    /**
     * Parses the init segment. Gets Default Sample Duration and Size from the
     * TREX box, and constructs a map of Track IDs to timescales. Each TRAK box
     * contains a track header (TKHD) containing track ID, and a media header
     * box (MDHD) containing the timescale for the track. The codec sample
     * entry box (or the FRMA box inside an encrypted sample entry) selects
     * the bitstream format used later by parse().
     *
     * Throws a CRITICAL shaka.util.Error in the TEXT category with code
     * INVALID_MP4_CEA when no track was found, or when the number of track IDs
     * and the number of timescales differ. Only logs a warning when the
     * bitstream format could not be determined.
     * @override
     */
    init(initSegment) {
      const Mp4Parser = shaka.util.Mp4Parser;
      const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;

      // Collected in box order; entry i of each array is paired up below.
      const trackIds = [];
      const timescales = [];

      const codecBoxParser = (box) => this.setBitstreamFormat_(box.name);

      new Mp4Parser()
          .box('moov', Mp4Parser.children)
          .box('mvex', Mp4Parser.children)
          .fullBox('trex', (box) => {
            const parsedTREXBox = shaka.util.Mp4BoxParsers.parseTREX(
                box.reader);

            this.defaultSampleDuration_ = parsedTREXBox.defaultSampleDuration;
            this.defaultSampleSize_ = parsedTREXBox.defaultSampleSize;
          })
          .box('trak', Mp4Parser.children)
          .fullBox('tkhd', (box) => {
            goog.asserts.assert(
                box.version != null,
                'TKHD is a full box and should have a valid version.');
            const parsedTKHDBox = shaka.util.Mp4BoxParsers.parseTKHD(
                box.reader, box.version);
            trackIds.push(parsedTKHDBox.trackId);
          })
          .box('mdia', Mp4Parser.children)
          .fullBox('mdhd', (box) => {
            goog.asserts.assert(
                box.version != null,
                'MDHD is a full box and should have a valid version.');
            const parsedMDHDBox = shaka.util.Mp4BoxParsers.parseMDHD(
                box.reader, box.version);
            timescales.push(parsedMDHDBox.timescale);
          })
          .box('minf', Mp4Parser.children)
          .box('stbl', Mp4Parser.children)
          .fullBox('stsd', Mp4Parser.sampleDescription)

          // These are the various boxes that signal a codec.
          .box('avc1', codecBoxParser)
          .box('avc3', codecBoxParser)
          .box('dvav', codecBoxParser)
          .box('dva1', codecBoxParser)
          .box('hev1', codecBoxParser)
          .box('hvc1', codecBoxParser)
          .box('dvh1', codecBoxParser)
          .box('dvhe', codecBoxParser)
          .box('vvc1', codecBoxParser)
          .box('vvi1', codecBoxParser)
          .box('dvc1', codecBoxParser)
          .box('dvi1', codecBoxParser)

          // This signals an encrypted sample, which we can go inside of to
          // find the codec used.
          .box('encv', Mp4Parser.visualSampleEntry)
          .box('sinf', Mp4Parser.children)
          .box('frma', (box) => {
            const {codec} = shaka.util.Mp4BoxParsers.parseFRMA(box.reader);
            this.setBitstreamFormat_(codec);
          })

          .parse(initSegment,
              /* partialOkay= */ true, /* stopOnPartial= */ true);

      // At least one track should exist, and each track should have a
      // corresponding Id in TKHD box, and timescale in its MDHD box
      if (!trackIds.length || !timescales.length ||
          trackIds.length != timescales.length) {
        throw new shaka.util.Error(
            shaka.util.Error.Severity.CRITICAL,
            shaka.util.Error.Category.TEXT,
            shaka.util.Error.Code.INVALID_MP4_CEA);
      }

      if (this.bitstreamFormat_ == BitstreamFormat.UNKNOWN) {
        shaka.log.alwaysWarn(
            'Unable to determine bitstream format for CEA parsing!');
      }

      // Populate the map from track Id to timescale
      trackIds.forEach((trackId, idx) => {
        this.trackIdToTimescale_.set(trackId, timescales[idx]);
      });
    }

    /**
     * Parses each video segment. In fragmented MP4s, MOOF and MDAT come in
     * pairs. The following logic gets the necessary info from MOOFs to parse
     * MDATs (base media decode time, sample sizes/offsets/durations, etc),
     * and then parses the MDAT boxes for CEA-708 packets using this
     * information. The collected caption packets are returned as an array;
     * the array is empty when the bitstream format is unknown.
     * @override
     */
    parse(mediaSegment) {
      const Mp4Parser = shaka.util.Mp4Parser;
      const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;

      if (this.bitstreamFormat_ == BitstreamFormat.UNKNOWN) {
        // We don't know how to extract SEI from this.
        return [];
      }

      /** @type {!Array<!shaka.extern.ICeaParser.CaptionPacket>} **/
      const captionPackets = [];

      // Start of the most recent moof box, as reported by the box parser.
      let moofOffset = 0;

      /** @type {!Array<!shaka.cea.Mp4CeaParser.ParsedTRAF>} */
      let parsedTRAFs = [];

      new Mp4Parser()
          .box('moof', (box) => {
            moofOffset = box.start;
            // traf box parsing is reset on each moof.
            parsedTRAFs = [];
            Mp4Parser.children(box);
          })
          .box('traf', (box) => {
            // Start every traf from the TREX defaults; the tfhd, tfdt and
            // trun callbacks below fill in the most recently pushed entry.
            parsedTRAFs.push({
              baseMediaDecodeTime: null,
              defaultSampleDuration: this.defaultSampleDuration_,
              defaultSampleSize: this.defaultSampleSize_,
              parsedTRUNs: [],
              timescale: shaka.cea.CeaUtils.DEFAULT_TIMESCALE_VALUE,
            });
            Mp4Parser.children(box);
          })
          .fullBox('trun', (box) => {
            goog.asserts.assert(
                box.version != null && box.flags != null,
                'TRUN is a full box and should have a valid version & flags.');

            const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];

            const parsedTRUN = shaka.util.Mp4BoxParsers.parseTRUN(
                box.reader, box.version, box.flags);

            lastTRAF.parsedTRUNs.push(parsedTRUN);
          })
          .fullBox('tfhd', (box) => {
            goog.asserts.assert(
                box.flags != null,
                'TFHD is a full box and should have valid flags.');

            const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];

            const parsedTFHD = shaka.util.Mp4BoxParsers.parseTFHD(
                box.reader, box.flags);

            // If specified, defaultSampleDuration and defaultSampleSize
            // override the ones specified in the TREX box
            lastTRAF.defaultSampleDuration =
                parsedTFHD.defaultSampleDuration ||
                this.defaultSampleDuration_;

            lastTRAF.defaultSampleSize = parsedTFHD.defaultSampleSize ||
                this.defaultSampleSize_;

            const trackId = parsedTFHD.trackId;

            // Get the timescale from the track Id
            if (this.trackIdToTimescale_.has(trackId)) {
              lastTRAF.timescale = this.trackIdToTimescale_.get(trackId);
            }
          })
          .fullBox('tfdt', (box) => {
            goog.asserts.assert(
                box.version != null,
                'TFDT is a full box and should have a valid version.');

            const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];

            const parsedTFDT = shaka.util.Mp4BoxParsers.parseTFDTInaccurate(
                box.reader, box.version);

            lastTRAF.baseMediaDecodeTime = parsedTFDT.baseMediaDecodeTime;
          })
          .box('mdat', (box) => {
            // Distance from the mdat payload back to the start of the moof;
            // parseMdat_ adds the TRUN data offset to it.
            // NOTE(review): the 8 presumably accounts for the mdat box header
            // (size + type) — confirm against Mp4Parser.
            const offset = moofOffset - box.start - 8;
            const initialPosition = box.reader.getPosition();
            for (const parsedTRAF of parsedTRAFs) {
              if (parsedTRAF.baseMediaDecodeTime === null) {
                // This field should have been populated by the Base Media
                // Decode Time in the tfdt box.
                shaka.log.alwaysWarn(
                    'Unable to find base media decode time for CEA captions!');
                continue;
              }
              // Every traf is scanned from the same starting position.
              box.reader.seek(initialPosition);
              this.parseMdat_(box.reader,
                  parsedTRAF.baseMediaDecodeTime,
                  parsedTRAF.timescale,
                  parsedTRAF.defaultSampleDuration,
                  parsedTRAF.defaultSampleSize,
                  offset,
                  parsedTRAF.parsedTRUNs,
                  captionPackets);
            }
          })
          .parse(mediaSegment, /* partialOkay= */ false,
              /* stopOnPartial= */ true);

      return captionPackets;
    }

    /**
     * Parse MDAT box. Walks the length-prefixed NAL units of one track
     * fragment, hands the payload of every SEI NAL unit to the SEI processor
     * and appends the resulting packets, with their presentation time in
     * seconds, to captionPackets.
     *
     * Parsing is best-effort: if the track fragment has no TRUN, or if a NAL
     * unit claims more bytes than the reader holds, the method stops quietly
     * and keeps whatever was collected so far.
     *
     * @param {!shaka.util.DataViewReader} reader
     *   Reader positioned by the caller; the first sample is reached by
     *   skipping offset plus the first TRUN's data offset.
     * @param {number} time
     *   Base media decode time of the fragment, in timescale units.
     * @param {number} timescale
     * @param {number} defaultSampleDuration
     *   Used for samples whose TRUN entry has no duration.
     * @param {number} defaultSampleSize
     *   Used for samples whose TRUN entry has no size.
     * @param {number} offset
     *   Value added to the first TRUN's data offset before skipping.
     * @param {!Array<shaka.util.ParsedTRUNBox>} parsedTRUNs
     * @param {!Array<!shaka.extern.ICeaParser.CaptionPacket>} captionPackets
     *   Output array; packets are pushed onto it.
     * @private
     */
    parseMdat_(reader, time, timescale, defaultSampleDuration,
        defaultSampleSize, offset, parsedTRUNs, captionPackets) {
      const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;
      const CeaUtils = shaka.cea.CeaUtils;

      if (!parsedTRUNs.length) {
        // Without a TRUN there is no data offset to seek to and no sample
        // table to walk, so there is nothing to extract for this traf.
        // Returning here avoids dereferencing parsedTRUNs[0] below.
        return;
      }

      let sampleIndex = 0;

      // The fields in each ParsedTRUNSample contained in the sampleData
      // array are nullable. In the case of sample data and sample duration,
      // we use the defaults provided by the TREX/TFHD boxes. For sample
      // composition time offset, we default to 0.
      let sampleSize = defaultSampleSize;

      // Combine all sample data. This assumes that the samples described
      // across multiple trun boxes are still continuous in the mdat box.
      const sampleDatas = parsedTRUNs.map((t) => t.sampleData);
      const sampleData = [].concat(...sampleDatas);

      if (sampleData.length) {
        sampleSize = sampleData[0].sampleSize || defaultSampleSize;
      }

      reader.skip(offset + parsedTRUNs[0].dataOffset);

      while (reader.hasMoreData()) {
        // Each NAL unit is preceded by a 4-byte length.
        const naluSize = reader.readUint32();
        const naluHeader = reader.readUint8();
        let naluType = null;
        let isSeiMessage = false;
        // Number of header bytes already consumed from this NAL unit.
        let naluHeaderSize = 1;

        goog.asserts.assert(this.bitstreamFormat_ != BitstreamFormat.UNKNOWN,
            'Bitstream format should have been checked before now!');
        switch (this.bitstreamFormat_) {
          case BitstreamFormat.H264:
            naluType = naluHeader & 0x1f;
            isSeiMessage = naluType == CeaUtils.H264_NALU_TYPE_SEI;
            break;

          case BitstreamFormat.H265:
            // Two-byte NAL header: consume the second byte as well.
            naluHeaderSize = 2;
            reader.skip(1);
            naluType = (naluHeader >> 1) & 0x3f;
            isSeiMessage =
                naluType == CeaUtils.H265_PREFIX_NALU_TYPE_SEI ||
                naluType == CeaUtils.H265_SUFFIX_NALU_TYPE_SEI;
            break;

          case BitstreamFormat.H266:
            // Two-byte NAL header: consume the second byte as well.
            naluHeaderSize = 2;
            reader.skip(1);
            naluType = (naluHeader >> 1) & 0x3f;
            isSeiMessage =
                naluType == CeaUtils.H266_PREFIX_NALU_TYPE_SEI ||
                naluType == CeaUtils.H266_SUFFIX_NALU_TYPE_SEI;
            break;

          default:
            return;
        }

        if (isSeiMessage) {
          let timeOffset = 0;

          if (sampleIndex < sampleData.length) {
            timeOffset =
                sampleData[sampleIndex].sampleCompositionTimeOffset || 0;
          }

          const pts = (time + timeOffset) / timescale;

          let seiPayload;
          try {
            seiPayload = reader.readBytes(naluSize - naluHeaderSize);
          } catch (e) {
            // Same policy as for non-SEI NAL units below: a truncated NAL
            // unit must not break playback, so stop extracting captions.
            break;
          }

          for (const packet of this.seiProcessor_.process(seiPayload)) {
            captionPackets.push({
              packet,
              pts,
            });
          }
        } else {
          try {
            reader.skip(naluSize - naluHeaderSize);
          } catch (e) {
            // It is necessary to ignore this error because it can break the
            // start of playback even if the user does not want to see the
            // subtitles.
            break;
          }
        }

        // Account for the NAL unit and its 4-byte length prefix. Reaching
        // zero means the current sample is fully consumed.
        sampleSize -= (naluSize + 4);
        if (sampleSize == 0) {
          if (sampleIndex < sampleData.length) {
            time += sampleData[sampleIndex].sampleDuration ||
                defaultSampleDuration;
          } else {
            time += defaultSampleDuration;
          }

          sampleIndex++;

          if (sampleIndex < sampleData.length) {
            sampleSize =
                sampleData[sampleIndex].sampleSize || defaultSampleSize;
          } else {
            sampleSize = defaultSampleSize;
          }
        }
      }
    }

    /**
     * Records the bitstream format for a codec. Codecs that are not in
     * CodecBitstreamMap_ leave the current format untouched.
     * @param {string} codec A fourcc for a codec.
     * @private
     */
    setBitstreamFormat_(codec) {
      if (shaka.cea.Mp4CeaParser.CodecBitstreamMap_.has(codec)) {
        this.bitstreamFormat_ =
            shaka.cea.Mp4CeaParser.CodecBitstreamMap_.get(codec);
      }
    }
  };
  /**
   * Video bitstream formats the parser distinguishes when reading NAL unit
   * headers.
   * @enum {number}
   */
  shaka.cea.Mp4CeaParser.BitstreamFormat = {
    /** No known codec was found; no captions are extracted. */
    UNKNOWN: 0,

    /** NAL units with a one-byte header. */
    H264: 1,

    /** NAL units with a two-byte header. */
    H265: 2,

    /** NAL units with a two-byte header. */
    H266: 3,
  };
  /**
   * Lookup from a codec fourcc to the bitstream format it is carried in.
   * @private {Map<string, shaka.cea.Mp4CeaParser.BitstreamFormat>}
   */
  shaka.cea.Mp4CeaParser.CodecBitstreamMap_ = (() => {
    const Format = shaka.cea.Mp4CeaParser.BitstreamFormat;
    const codecToFormat = new Map();

    // AVC, followed by Dolby Vision based in AVC
    for (const fourcc of ['avc1', 'avc3', 'dvav', 'dva1']) {
      codecToFormat.set(fourcc, Format.H264);
    }

    // HEVC, followed by Dolby Vision based in HEVC
    for (const fourcc of ['hev1', 'hvc1', 'dvh1', 'dvhe']) {
      codecToFormat.set(fourcc, Format.H265);
    }

    // VVC, followed by Dolby Vision based in VVC
    for (const fourcc of ['vvc1', 'vvi1', 'dvc1', 'dvi1']) {
      codecToFormat.set(fourcc, Format.H266);
    }

    return codecToFormat;
  })();
  /**
   * @typedef {{
   *   baseMediaDecodeTime: ?number,
   *   defaultSampleDuration: number,
   *   defaultSampleSize: number,
   *   parsedTRUNs: !Array<shaka.util.ParsedTRUNBox>,
   *   timescale: number,
   * }}
   *
   * @description
   * Information gathered from one track fragment (TRAF) box.
   *
   * @property {?number} baseMediaDecodeTime
   *   Base media decode time read from the TFDT box, or null while no TFDT
   *   has been seen for this track fragment.
   * @property {number} defaultSampleDuration
   *   Default sample duration from the TFHD box, falling back to the value
   *   from the TREX box.
   * @property {number} defaultSampleSize
   *   Default sample size from the TFHD box, falling back to the value from
   *   the TREX box.
   * @property {!Array<shaka.util.ParsedTRUNBox>} parsedTRUNs
   *   The TRUN boxes parsed for this track fragment, in the order they were
   *   encountered.
   * @property {number} timescale
   *   Timescale of the fragment's track, or the CEA default timescale when
   *   the track id is not known from the init segment.
   */
  shaka.cea.Mp4CeaParser.ParsedTRAF;
  // Make this parser the closed caption parser used for 'video/mp4' content.
  shaka.media.ClosedCaptionParser.registerParser(
      'video/mp4',
      () => {
        return new shaka.cea.Mp4CeaParser();
      });