| /* This Source Code Form is subject to the terms of the Mozilla Public |
| * License, v. 2.0. If a copy of the MPL was not distributed with this |
| * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ |
| |
| use mime::{self, Mime}; |
| |
| use crate::LoadContext; |
| |
| pub struct MimeClassifier { |
| image_classifier: GroupedClassifier, |
| audio_video_classifier: GroupedClassifier, |
| scriptable_classifier: GroupedClassifier, |
| plaintext_classifier: GroupedClassifier, |
| archive_classifier: GroupedClassifier, |
| binary_or_plaintext: BinaryOrPlaintextClassifier, |
| font_classifier: GroupedClassifier, |
| } |
| |
| #[derive(PartialEq)] |
| pub enum MediaType { |
| Xml, |
| Html, |
| AudioVideo, |
| Image, |
| JavaScript, |
| Json, |
| Font, |
| Text, |
| Css, |
| } |
| |
| #[derive(PartialEq)] |
| pub enum ApacheBugFlag { |
| On, |
| Off, |
| } |
| |
| impl ApacheBugFlag { |
| /// <https://mimesniff.spec.whatwg.org/#supplied-mime-type-detection-algorithm> |
| pub fn from_content_type(mime_type: Option<&Mime>) -> ApacheBugFlag { |
| // TODO(36801): also handle charset ISO-8859-1 |
| if mime_type.is_some_and(|mime_type| { |
| *mime_type == mime::TEXT_PLAIN || *mime_type == mime::TEXT_PLAIN_UTF_8 |
| }) { |
| ApacheBugFlag::On |
| } else { |
| ApacheBugFlag::Off |
| } |
| } |
| } |
| |
| #[derive(PartialEq)] |
| pub enum NoSniffFlag { |
| On, |
| Off, |
| } |
| |
| impl Default for MimeClassifier { |
| fn default() -> Self { |
| Self { |
| image_classifier: GroupedClassifier::image_classifer(), |
| audio_video_classifier: GroupedClassifier::audio_video_classifier(), |
| scriptable_classifier: GroupedClassifier::scriptable_classifier(), |
| plaintext_classifier: GroupedClassifier::plaintext_classifier(), |
| archive_classifier: GroupedClassifier::archive_classifier(), |
| binary_or_plaintext: BinaryOrPlaintextClassifier, |
| font_classifier: GroupedClassifier::font_classifier(), |
| } |
| } |
| } |
| |
| impl MimeClassifier { |
| /// <https://mimesniff.spec.whatwg.org/#mime-type-sniffing-algorithm> |
| pub fn classify<'a>( |
| &'a self, |
| context: LoadContext, |
| no_sniff_flag: NoSniffFlag, |
| apache_bug_flag: ApacheBugFlag, |
| supplied_type: &Option<Mime>, |
| data: &'a [u8], |
| ) -> Mime { |
| let supplied_type_or_octet_stream = supplied_type |
| .clone() |
| .unwrap_or(mime::APPLICATION_OCTET_STREAM); |
| // Step 1. If the supplied MIME type is an XML MIME type or HTML MIME type, |
| // the computed MIME type is the supplied MIME type. |
| if Self::is_xml(&supplied_type_or_octet_stream) || |
| Self::is_html(&supplied_type_or_octet_stream) |
| { |
| return supplied_type_or_octet_stream; |
| } |
| match context { |
| LoadContext::Browsing => match *supplied_type { |
| // Step 2. If the supplied MIME type is undefined or if the supplied MIME type’s essence is "unknown/unknown", |
| // "application/unknown", or "*/*", execute the rules for identifying |
| // an unknown MIME type with the sniff-scriptable flag equal to the inverse of the no-sniff flag and abort these steps. |
| None => self.sniff_unknown_type(no_sniff_flag, data), |
| Some(ref supplied_type) => { |
| if MimeClassifier::is_explicit_unknown(supplied_type) { |
| return self.sniff_unknown_type(no_sniff_flag, data); |
| } |
| // Step 3. If the no-sniff flag is set, the computed MIME type is the supplied MIME type. |
| // Abort these steps. |
| if no_sniff_flag == NoSniffFlag::On { |
| return supplied_type.clone(); |
| } |
| // Step 4. If the check-for-apache-bug flag is set, |
| // execute the rules for distinguishing if a resource is text or binary and abort these steps. |
| if apache_bug_flag == ApacheBugFlag::On { |
| return self.sniff_text_or_data(data); |
| } |
| match MimeClassifier::get_media_type(supplied_type) { |
| // Step 5. If the supplied MIME type is an image MIME type supported by the user agent, |
| // let matched-type be the result of executing the image type pattern matching algorithm with |
| // the resource header as the byte sequence to be matched. |
| Some(MediaType::Image) => { |
| // Step 6. If matched-type is not undefined, the computed MIME type is matched-type. |
| self.image_classifier.classify(data) |
| }, |
| // Step 7. If the supplied MIME type is an audio or video MIME type supported by the user agent, |
| // let matched-type be the result of executing the audio or video type pattern matching algorithm |
| // with the resource header as the byte sequence to be matched. |
| Some(MediaType::AudioVideo) => { |
| // Step 8. If matched-type is not undefined, the computed MIME type is matched-type. |
| self.audio_video_classifier.classify(data) |
| }, |
| Some(MediaType::Html) | Some(MediaType::Xml) => unreachable!(), |
| _ => None, |
| } |
| // Step 9. The computed MIME type is the supplied MIME type. |
| .unwrap_or(supplied_type.clone()) |
| }, |
| }, |
| LoadContext::Image => { |
| // Section 8.2 Sniffing an image context |
| match MimeClassifier::maybe_get_media_type(supplied_type) { |
| Some(MediaType::Xml) => None, |
| _ => self.image_classifier.classify(data), |
| } |
| .unwrap_or(supplied_type_or_octet_stream) |
| }, |
| LoadContext::AudioVideo => { |
| // Section 8.3 Sniffing an image context |
| match MimeClassifier::maybe_get_media_type(supplied_type) { |
| Some(MediaType::Xml) => None, |
| _ => self.audio_video_classifier.classify(data), |
| } |
| .unwrap_or(supplied_type_or_octet_stream) |
| }, |
| LoadContext::Plugin => { |
| // 8.4 Sniffing in a plugin context |
| // |
| // This section was *not* finalized in the specs at the time |
| // of this implementation. |
| match *supplied_type { |
| None => mime::APPLICATION_OCTET_STREAM, |
| _ => supplied_type_or_octet_stream, |
| } |
| }, |
| LoadContext::Style => { |
| // 8.5 Sniffing in a style context |
| // |
| // This section was *not* finalized in the specs at the time |
| // of this implementation. |
| match *supplied_type { |
| None => mime::TEXT_CSS, |
| _ => supplied_type_or_octet_stream, |
| } |
| }, |
| LoadContext::Script => { |
| // 8.6 Sniffing in a script context |
| // |
| // This section was *not* finalized in the specs at the time |
| // of this implementation. |
| match *supplied_type { |
| None => mime::TEXT_JAVASCRIPT, |
| _ => supplied_type_or_octet_stream, |
| } |
| }, |
| LoadContext::Font => { |
| // 8.7 Sniffing in a font context |
| match MimeClassifier::maybe_get_media_type(supplied_type) { |
| Some(MediaType::Xml) => None, |
| _ => self.font_classifier.classify(data), |
| } |
| .unwrap_or(supplied_type_or_octet_stream) |
| }, |
| LoadContext::TextTrack => { |
| // 8.8 Sniffing in a text track context |
| // |
| // This section was *not* finalized in the specs at the time |
| // of this implementation. |
| "text/vtt".parse().unwrap() |
| }, |
| LoadContext::CacheManifest => { |
| // 8.9 Sniffing in a cache manifest context |
| // |
| // This section was *not* finalized in the specs at the time |
| // of this implementation. |
| "text/cache-manifest".parse().unwrap() |
| }, |
| } |
| } |
| |
| pub fn validate(&self) -> Result<(), String> { |
| self.image_classifier.validate()?; |
| self.audio_video_classifier.validate()?; |
| self.scriptable_classifier.validate()?; |
| self.plaintext_classifier.validate()?; |
| self.archive_classifier.validate()?; |
| self.binary_or_plaintext.validate()?; |
| self.font_classifier.validate()?; |
| Ok(()) |
| } |
| |
| // some sort of iterator over the classifiers might be better? |
| fn sniff_unknown_type(&self, no_sniff_flag: NoSniffFlag, data: &[u8]) -> Mime { |
| let should_sniff_scriptable = no_sniff_flag == NoSniffFlag::Off; |
| let sniffed = if should_sniff_scriptable { |
| self.scriptable_classifier.classify(data) |
| } else { |
| None |
| }; |
| |
| sniffed |
| .or_else(|| self.plaintext_classifier.classify(data)) |
| .or_else(|| self.image_classifier.classify(data)) |
| .or_else(|| self.audio_video_classifier.classify(data)) |
| .or_else(|| self.archive_classifier.classify(data)) |
| .or_else(|| self.binary_or_plaintext.classify(data)) |
| .expect("BinaryOrPlaintextClassifier always succeeds") |
| } |
| |
| fn sniff_text_or_data<'a>(&'a self, data: &'a [u8]) -> Mime { |
| self.binary_or_plaintext |
| .classify(data) |
| .expect("BinaryOrPlaintextClassifier always succeeds") |
| } |
| |
| /// <https://mimesniff.spec.whatwg.org/#xml-mime-type> |
| fn is_xml(mt: &Mime) -> bool { |
| mt.suffix() == Some(mime::XML) || |
| mt.essence_str() == "text/xml" || |
| mt.essence_str() == "application/xml" |
| } |
| |
| /// <https://mimesniff.spec.whatwg.org/#html-mime-type> |
| fn is_html(mt: &Mime) -> bool { |
| mt.essence_str() == "text/html" |
| } |
| |
| /// <https://mimesniff.spec.whatwg.org/#image-mime-type> |
| fn is_image(mt: &Mime) -> bool { |
| mt.type_() == mime::IMAGE |
| } |
| |
| /// <https://mimesniff.spec.whatwg.org/#audio-or-video-mime-type> |
| fn is_audio_video(mt: &Mime) -> bool { |
| mt.type_() == mime::AUDIO || |
| mt.type_() == mime::VIDEO || |
| mt.essence_str() == "application/ogg" |
| } |
| |
| fn is_explicit_unknown(mt: &Mime) -> bool { |
| mt.type_().as_str() == "unknown" && mt.subtype().as_str() == "unknown" || |
| mt.type_() == mime::APPLICATION && mt.subtype().as_str() == "unknown" || |
| mt.type_() == mime::STAR && mt.subtype() == mime::STAR |
| } |
| |
| /// <https://mimesniff.spec.whatwg.org/#javascript-mime-type> |
| fn is_javascript(mt: &Mime) -> bool { |
| (mt.type_() == mime::APPLICATION && |
| (["ecmascript", "javascript", "x-ecmascript", "x-javascript"] |
| .contains(&mt.subtype().as_str()))) || |
| (mt.type_() == mime::TEXT && |
| ([ |
| "ecmascript", |
| "javascript", |
| "javascript1.0", |
| "javascript1.1", |
| "javascript1.2", |
| "javascript1.3", |
| "javascript1.4", |
| "javascript1.5", |
| "jscript", |
| "livescript", |
| "x-ecmascript", |
| "x-javascript", |
| ] |
| .contains(&mt.subtype().as_str()))) |
| } |
| |
| /// <https://mimesniff.spec.whatwg.org/#json-mime-type> |
| fn is_json(mt: &Mime) -> bool { |
| mt.suffix() == Some(mime::JSON) || |
| (mt.subtype() == mime::JSON && |
| (mt.type_() == mime::APPLICATION || mt.type_() == mime::TEXT)) |
| } |
| |
| /// <https://mimesniff.spec.whatwg.org/#font-mime-type> |
| fn is_font(mt: &Mime) -> bool { |
| mt.type_() == mime::FONT || |
| (mt.type_() == mime::APPLICATION && |
| ([ |
| "font-cff", |
| "font-off", |
| "font-sfnt", |
| "font-ttf", |
| "font-woff", |
| "vnd.ms-fontobject", |
| "vnd.ms-opentype", |
| ] |
| .contains(&mt.subtype().as_str()))) |
| } |
| |
| fn is_text(mt: &Mime) -> bool { |
| *mt == mime::TEXT_PLAIN || mt.essence_str() == "text/vtt" |
| } |
| |
| fn is_css(mt: &Mime) -> bool { |
| mt.essence_str() == "text/css" |
| } |
| |
| pub fn get_media_type(mime: &Mime) -> Option<MediaType> { |
| if MimeClassifier::is_xml(mime) { |
| Some(MediaType::Xml) |
| } else if MimeClassifier::is_html(mime) { |
| Some(MediaType::Html) |
| } else if MimeClassifier::is_image(mime) { |
| Some(MediaType::Image) |
| } else if MimeClassifier::is_audio_video(mime) { |
| Some(MediaType::AudioVideo) |
| } else if MimeClassifier::is_javascript(mime) { |
| Some(MediaType::JavaScript) |
| } else if MimeClassifier::is_font(mime) { |
| Some(MediaType::Font) |
| } else if MimeClassifier::is_json(mime) { |
| Some(MediaType::Json) |
| } else if MimeClassifier::is_text(mime) { |
| Some(MediaType::Text) |
| } else if MimeClassifier::is_css(mime) { |
| Some(MediaType::Css) |
| } else { |
| None |
| } |
| } |
| |
| fn maybe_get_media_type(supplied_type: &Option<Mime>) -> Option<MediaType> { |
| supplied_type |
| .as_ref() |
| .and_then(MimeClassifier::get_media_type) |
| } |
| } |
| |
| // Interface used for composite types |
| trait MIMEChecker { |
| fn classify(&self, data: &[u8]) -> Option<Mime>; |
| /// Validate the MIME checker configuration |
| fn validate(&self) -> Result<(), String>; |
| } |
| |
| struct ByteMatcher { |
| pattern: &'static [u8], |
| mask: &'static [u8], |
| leading_ignore: &'static [u8], |
| content_type: Mime, |
| } |
| |
| impl ByteMatcher { |
| fn matches(&self, data: &[u8]) -> Option<usize> { |
| if data.len() < self.pattern.len() { |
| None |
| } else if data == self.pattern { |
| Some(self.pattern.len()) |
| } else { |
| data[..data.len() - self.pattern.len() + 1] |
| .iter() |
| .position(|x| !self.leading_ignore.contains(x)) |
| .and_then(|start| { |
| if data[start..] |
| .iter() |
| .zip(self.pattern.iter()) |
| .zip(self.mask.iter()) |
| .all(|((&data, &pattern), &mask)| (data & mask) == pattern) |
| { |
| Some(start + self.pattern.len()) |
| } else { |
| None |
| } |
| }) |
| } |
| } |
| } |
| |
| impl MIMEChecker for ByteMatcher { |
| fn classify(&self, data: &[u8]) -> Option<Mime> { |
| self.matches(data).map(|_| self.content_type.clone()) |
| } |
| |
| fn validate(&self) -> Result<(), String> { |
| if self.pattern.is_empty() { |
| return Err(format!("Zero length pattern for {:?}", self.content_type)); |
| } |
| if self.pattern.len() != self.mask.len() { |
| return Err(format!( |
| "Unequal pattern and mask length for {:?}", |
| self.content_type |
| )); |
| } |
| if self |
| .pattern |
| .iter() |
| .zip(self.mask.iter()) |
| .any(|(&pattern, &mask)| pattern & mask != pattern) |
| { |
| return Err(format!( |
| "Pattern not pre-masked for {:?}", |
| self.content_type |
| )); |
| } |
| Ok(()) |
| } |
| } |
| |
| struct TagTerminatedByteMatcher { |
| matcher: ByteMatcher, |
| } |
| |
| impl MIMEChecker for TagTerminatedByteMatcher { |
| fn classify(&self, data: &[u8]) -> Option<Mime> { |
| self.matcher.matches(data).and_then(|j| { |
| if j < data.len() && (data[j] == b' ' || data[j] == b'>') { |
| Some(self.matcher.content_type.clone()) |
| } else { |
| None |
| } |
| }) |
| } |
| |
| fn validate(&self) -> Result<(), String> { |
| self.matcher.validate() |
| } |
| } |
| |
| pub struct Mp4Matcher; |
| |
| impl Mp4Matcher { |
| /// <https://mimesniff.spec.whatwg.org/#matches-the-signature-for-mp4> |
| pub fn matches(&self, data: &[u8]) -> bool { |
| // Step 1. Let sequence be the byte sequence to be matched, |
| // where sequence[s] is byte s in sequence and sequence[0] is the first byte in sequence. |
| // Step 2. Let length be the number of bytes in sequence. |
| // Step 3. If length is less than 12, return false. |
| if data.len() < 12 { |
| return false; |
| } |
| |
| // Step 4. Let box-size be the four bytes from sequence[0] to sequence[3], |
| // interpreted as a 32-bit unsigned big-endian integer. |
| let box_size = (((data[0] as u32) << 24) | |
| ((data[1] as u32) << 16) | |
| ((data[2] as u32) << 8) | |
| (data[3] as u32)) as usize; |
| // Step 5. If length is less than box-size or if box-size modulo 4 is not equal to 0, return false. |
| if (data.len() < box_size) || (box_size % 4 != 0) { |
| return false; |
| } |
| |
| // Step 6. If the four bytes from sequence[4] to sequence[7] are not equal to 0x66 0x74 0x79 0x70 ("ftyp"), return false. |
| let ftyp = [0x66, 0x74, 0x79, 0x70]; |
| if !data[4..].starts_with(&ftyp) { |
| return false; |
| } |
| |
| // Step 7. If the three bytes from sequence[8] to sequence[10] are equal to 0x6D 0x70 0x34 ("mp4"), return true. |
| let mp4 = [0x6D, 0x70, 0x34]; |
| data[8..].starts_with(&mp4) || |
| // Step 8. Let bytes-read be 16. |
| // Step 9. While bytes-read is less than box-size, continuously loop through these steps: |
| data[16..box_size] |
| // Step 11. Increment bytes-read by 4. |
| .chunks(4) |
| // Step 10. If the three bytes from sequence[bytes-read] to sequence[bytes-read + 2] |
| // are equal to 0x6D 0x70 0x34 ("mp4"), return true. |
| .any(|chunk| chunk.starts_with(&mp4)) |
| // Step 12. Return false. |
| } |
| } |
| impl MIMEChecker for Mp4Matcher { |
| fn classify(&self, data: &[u8]) -> Option<Mime> { |
| if self.matches(data) { |
| Some("video/mp4".parse().unwrap()) |
| } else { |
| None |
| } |
| } |
| |
| fn validate(&self) -> Result<(), String> { |
| Ok(()) |
| } |
| } |
| |
| struct BinaryOrPlaintextClassifier; |
| |
| impl BinaryOrPlaintextClassifier { |
| /// <https://mimesniff.spec.whatwg.org/#rules-for-text-or-binary> |
| fn classify_impl(&self, data: &[u8]) -> Mime { |
| // Step 1. Let length be the number of bytes in the resource header. |
| // Step 2. If length is greater than or equal to 2 and |
| // the first 2 bytes of the resource header are equal to 0xFE 0xFF (UTF-16BE BOM) |
| // or 0xFF 0xFE (UTF-16LE BOM), the computed MIME type is "text/plain". |
| // Step 3. If length is greater than or equal to 3 |
| // and the first 3 bytes of the resource header are equal to |
| // 0xEF 0xBB 0xBF (UTF-8 BOM), the computed MIME type is "text/plain". |
| if data.starts_with(&[0xFFu8, 0xFEu8]) || |
| data.starts_with(&[0xFEu8, 0xFFu8]) || |
| data.starts_with(&[0xEFu8, 0xBBu8, 0xBFu8]) |
| { |
| mime::TEXT_PLAIN |
| } else if data.iter().any(|&x| { |
| x <= 0x08u8 || |
| x == 0x0Bu8 || |
| (0x0Eu8..=0x1Au8).contains(&x) || |
| (0x1Cu8..=0x1Fu8).contains(&x) |
| }) { |
| // Step 5. The computed MIME type is "application/octet-stream". |
| mime::APPLICATION_OCTET_STREAM |
| } else { |
| // Step 4. If the resource header contains no binary data bytes, |
| // the computed MIME type is "text/plain". |
| mime::TEXT_PLAIN |
| } |
| } |
| } |
| impl MIMEChecker for BinaryOrPlaintextClassifier { |
| fn classify(&self, data: &[u8]) -> Option<Mime> { |
| Some(self.classify_impl(data)) |
| } |
| |
| fn validate(&self) -> Result<(), String> { |
| Ok(()) |
| } |
| } |
| struct GroupedClassifier { |
| byte_matchers: Vec<Box<dyn MIMEChecker + Send + Sync>>, |
| } |
| impl GroupedClassifier { |
| fn image_classifer() -> GroupedClassifier { |
| GroupedClassifier { |
| byte_matchers: vec![ |
| // Keep this in sync with 'is_supported_mime_type' from |
| // components/style/servo/media_queries.rs |
| Box::new(ByteMatcher::image_x_icon()), |
| Box::new(ByteMatcher::image_x_icon_cursor()), |
| Box::new(ByteMatcher::image_bmp()), |
| Box::new(ByteMatcher::image_gif89a()), |
| Box::new(ByteMatcher::image_gif87a()), |
| Box::new(ByteMatcher::image_webp()), |
| Box::new(ByteMatcher::image_png()), |
| Box::new(ByteMatcher::image_jpeg()), |
| ], |
| } |
| } |
| fn audio_video_classifier() -> GroupedClassifier { |
| GroupedClassifier { |
| byte_matchers: vec![ |
| Box::new(ByteMatcher::video_webm()), |
| Box::new(ByteMatcher::audio_basic()), |
| Box::new(ByteMatcher::audio_aiff()), |
| Box::new(ByteMatcher::audio_mpeg()), |
| Box::new(ByteMatcher::application_ogg()), |
| Box::new(ByteMatcher::audio_midi()), |
| Box::new(ByteMatcher::video_avi()), |
| Box::new(ByteMatcher::audio_wave()), |
| Box::new(Mp4Matcher), |
| ], |
| } |
| } |
| fn scriptable_classifier() -> GroupedClassifier { |
| GroupedClassifier { |
| byte_matchers: vec![ |
| Box::new(ByteMatcher::text_html_doctype()), |
| Box::new(ByteMatcher::text_html_page()), |
| Box::new(ByteMatcher::text_html_head()), |
| Box::new(ByteMatcher::text_html_script()), |
| Box::new(ByteMatcher::text_html_iframe()), |
| Box::new(ByteMatcher::text_html_h1()), |
| Box::new(ByteMatcher::text_html_div()), |
| Box::new(ByteMatcher::text_html_font()), |
| Box::new(ByteMatcher::text_html_table()), |
| Box::new(ByteMatcher::text_html_a()), |
| Box::new(ByteMatcher::text_html_style()), |
| Box::new(ByteMatcher::text_html_title()), |
| Box::new(ByteMatcher::text_html_b()), |
| Box::new(ByteMatcher::text_html_body()), |
| Box::new(ByteMatcher::text_html_br()), |
| Box::new(ByteMatcher::text_html_p()), |
| Box::new(ByteMatcher::text_html_comment()), |
| Box::new(ByteMatcher::text_xml()), |
| Box::new(ByteMatcher::application_pdf()), |
| ], |
| } |
| } |
| fn plaintext_classifier() -> GroupedClassifier { |
| GroupedClassifier { |
| byte_matchers: vec![ |
| Box::new(ByteMatcher::text_plain_utf_8_bom()), |
| Box::new(ByteMatcher::text_plain_utf_16le_bom()), |
| Box::new(ByteMatcher::text_plain_utf_16be_bom()), |
| Box::new(ByteMatcher::application_postscript()), |
| ], |
| } |
| } |
| fn archive_classifier() -> GroupedClassifier { |
| GroupedClassifier { |
| byte_matchers: vec![ |
| Box::new(ByteMatcher::application_x_gzip()), |
| Box::new(ByteMatcher::application_zip()), |
| Box::new(ByteMatcher::application_x_rar_compressed()), |
| ], |
| } |
| } |
| |
| fn font_classifier() -> GroupedClassifier { |
| GroupedClassifier { |
| byte_matchers: vec![ |
| Box::new(ByteMatcher::application_font_woff()), |
| Box::new(ByteMatcher::true_type_collection()), |
| Box::new(ByteMatcher::open_type()), |
| Box::new(ByteMatcher::true_type()), |
| Box::new(ByteMatcher::application_vnd_ms_font_object()), |
| ], |
| } |
| } |
| } |
| impl MIMEChecker for GroupedClassifier { |
| fn classify(&self, data: &[u8]) -> Option<Mime> { |
| self.byte_matchers |
| .iter() |
| .filter_map(|matcher| matcher.classify(data)) |
| .next() |
| } |
| |
| fn validate(&self) -> Result<(), String> { |
| for byte_matcher in &self.byte_matchers { |
| byte_matcher.validate()? |
| } |
| Ok(()) |
| } |
| } |
| |
| // Contains hard coded byte matchers |
| // TODO: These should be configured and not hard coded |
| impl ByteMatcher { |
| // A Windows Icon signature |
| fn image_x_icon() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\x00\x00\x01\x00", |
| mask: b"\xFF\xFF\xFF\xFF", |
| content_type: "image/x-icon".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // A Windows Cursor signature. |
| fn image_x_icon_cursor() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\x00\x00\x02\x00", |
| mask: b"\xFF\xFF\xFF\xFF", |
| content_type: "image/x-icon".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "BM", a BMP signature. |
| fn image_bmp() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"BM", |
| mask: b"\xFF\xFF", |
| content_type: mime::IMAGE_BMP, |
| leading_ignore: &[], |
| } |
| } |
| // The string "GIF89a", a GIF signature. |
| fn image_gif89a() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"GIF89a", |
| mask: b"\xFF\xFF\xFF\xFF\xFF\xFF", |
| content_type: mime::IMAGE_GIF, |
| leading_ignore: &[], |
| } |
| } |
| // The string "GIF87a", a GIF signature. |
| fn image_gif87a() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"GIF87a", |
| mask: b"\xFF\xFF\xFF\xFF\xFF\xFF", |
| content_type: mime::IMAGE_GIF, |
| leading_ignore: &[], |
| } |
| } |
| // The string "RIFF" followed by four bytes followed by the string "WEBPVP". |
| fn image_webp() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"RIFF\x00\x00\x00\x00WEBPVP", |
| mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF", |
| content_type: "image/webp".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // An error-checking byte followed by the string "PNG" followed by CR LF SUB LF, the PNG |
| // signature. |
| fn image_png() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\x89PNG\r\n\x1A\n", |
| mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", |
| content_type: mime::IMAGE_PNG, |
| leading_ignore: &[], |
| } |
| } |
| // The JPEG Start of Image marker followed by the indicator byte of another marker. |
| fn image_jpeg() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\xFF\xD8\xFF", |
| mask: b"\xFF\xFF\xFF", |
| content_type: mime::IMAGE_JPEG, |
| leading_ignore: &[], |
| } |
| } |
| // The WebM signature. [TODO: Use more bytes?] |
| fn video_webm() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\x1A\x45\xDF\xA3", |
| mask: b"\xFF\xFF\xFF\xFF", |
| content_type: "video/webm".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string ".snd", the basic audio signature. |
| fn audio_basic() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b".snd", |
| mask: b"\xFF\xFF\xFF\xFF", |
| content_type: "audio/basic".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "FORM" followed by four bytes followed by the string "AIFF", the AIFF signature. |
| fn audio_aiff() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"FORM\x00\x00\x00\x00AIFF", |
| mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", |
| content_type: "audio/aiff".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "ID3", the ID3v2-tagged MP3 signature. |
| fn audio_mpeg() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"ID3", |
| mask: b"\xFF\xFF\xFF", |
| content_type: "audio/mpeg".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "OggS" followed by NUL, the Ogg container signature. |
| fn application_ogg() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"OggS\x00", |
| mask: b"\xFF\xFF\xFF\xFF\xFF", |
| content_type: "application/ogg".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "MThd" followed by four bytes representing the number 6 in 32 bits (big-endian), |
| // the MIDI signature. |
| fn audio_midi() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"MThd\x00\x00\x00\x06", |
| mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", |
| content_type: "audio/midi".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "RIFF" followed by four bytes followed by the string "AVI ", the AVI signature. |
| fn video_avi() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"RIFF\x00\x00\x00\x00AVI ", |
| mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", |
| content_type: "video/avi".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "RIFF" followed by four bytes followed by the string "WAVE", the WAVE signature. |
| fn audio_wave() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"RIFF\x00\x00\x00\x00WAVE", |
| mask: b"\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF", |
| content_type: "audio/wave".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // doctype terminated with Tag terminating (TT) Byte |
| fn text_html_doctype() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<!DOCTYPE HTML", |
| mask: b"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // HTML terminated with Tag terminating (TT) Byte: 0x20 (SP) |
| fn text_html_page() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<HTML", |
| mask: b"\xFF\xDF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // head terminated with Tag Terminating (TT) Byte |
| fn text_html_head() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<HEAD", |
| mask: b"\xFF\xDF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // script terminated with Tag Terminating (TT) Byte |
| fn text_html_script() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<SCRIPT", |
| mask: b"\xFF\xDF\xDF\xDF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // iframe terminated with Tag Terminating (TT) Byte |
| fn text_html_iframe() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<IFRAME", |
| mask: b"\xFF\xDF\xDF\xDF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // h1 terminated with Tag Terminating (TT) Byte |
| fn text_html_h1() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<H1", |
| mask: b"\xFF\xDF\xFF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // div terminated with Tag Terminating (TT) Byte |
| fn text_html_div() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<DIV", |
| mask: b"\xFF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // font terminated with Tag Terminating (TT) Byte |
| fn text_html_font() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<FONT", |
| mask: b"\xFF\xDF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // table terminated with Tag Terminating (TT) Byte |
| fn text_html_table() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<TABLE", |
| mask: b"\xFF\xDF\xDF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // a terminated with Tag Terminating (TT) Byte |
| fn text_html_a() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<A", |
| mask: b"\xFF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // style terminated with Tag Terminating (TT) Byte |
| fn text_html_style() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<STYLE", |
| mask: b"\xFF\xDF\xDF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // title terminated with Tag Terminating (TT) Byte |
| fn text_html_title() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<TITLE", |
| mask: b"\xFF\xDF\xDF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // b terminated with Tag Terminating (TT) Byte |
| fn text_html_b() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<B", |
| mask: b"\xFF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // body terminated with Tag Terminating (TT) Byte |
| fn text_html_body() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<BODY", |
| mask: b"\xFF\xDF\xDF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // br terminated with Tag Terminating (TT) Byte |
| fn text_html_br() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<BR", |
| mask: b"\xFF\xDF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // p terminated with Tag Terminating (TT) Byte |
| fn text_html_p() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<P", |
| mask: b"\xFF\xDF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // comment terminated with Tag Terminating (TT) Byte |
| fn text_html_comment() -> TagTerminatedByteMatcher { |
| TagTerminatedByteMatcher { |
| matcher: ByteMatcher { |
| pattern: b"<!--", |
| mask: b"\xFF\xFF\xFF\xFF", |
| content_type: mime::TEXT_HTML, |
| leading_ignore: b"\t\n\x0C\r ", |
| }, |
| } |
| } |
| |
| // The string "<?xml". |
| fn text_xml() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"<?xml", |
| mask: b"\xFF\xFF\xFF\xFF\xFF", |
| content_type: mime::TEXT_XML, |
| leading_ignore: b"\t\n\x0C\r ", |
| } |
| } |
| // The string "%PDF-", the PDF signature. |
| fn application_pdf() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"%PDF-", |
| mask: b"\xFF\xFF\xFF\xFF\xFF", |
| content_type: mime::APPLICATION_PDF, |
| leading_ignore: &[], |
| } |
| } |
| // 34 bytes followed by the string "LP", the Embedded OpenType signature. |
| fn application_vnd_ms_font_object() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ |
| \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ |
| \x00\x00LP", |
| mask: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ |
| \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ |
| \x00\x00\xFF\xFF", |
| content_type: "application/vnd.ms-fontobject".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // 4 bytes representing the version number 1.0, a TrueType signature. |
| fn true_type() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\x00\x01\x00\x00", |
| mask: b"\xFF\xFF\xFF\xFF", |
| content_type: "application/font-sfnt".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "OTTO", the OpenType signature. |
| fn open_type() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"OTTO", |
| mask: b"\xFF\xFF\xFF\xFF", |
| content_type: "application/font-sfnt".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "ttcf", the TrueType Collection signature. |
| fn true_type_collection() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"ttcf", |
| mask: b"\xFF\xFF\xFF\xFF", |
| content_type: "application/font-sfnt".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "wOFF", the Web Open Font Format signature. |
| fn application_font_woff() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"wOFF", |
| mask: b"\xFF\xFF\xFF\xFF", |
| content_type: "application/font-woff".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The GZIP archive signature. |
| fn application_x_gzip() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\x1F\x8B\x08", |
| mask: b"\xFF\xFF\xFF", |
| content_type: "application/x-gzip".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "PK" followed by ETX EOT, the ZIP archive signature. |
| fn application_zip() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"PK\x03\x04", |
| mask: b"\xFF\xFF\xFF\xFF", |
| content_type: "application/zip".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "Rar " followed by SUB BEL NUL, the RAR archive signature. |
| fn application_x_rar_compressed() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"Rar \x1A\x07\x00", |
| mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF", |
| content_type: "application/x-rar-compressed".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // The string "%!PS-Adobe-", the PostScript signature. |
| fn application_postscript() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"%!PS-Adobe-", |
| mask: b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", |
| content_type: "application/postscript".parse().unwrap(), |
| leading_ignore: &[], |
| } |
| } |
| // UTF-16BE BOM |
| fn text_plain_utf_16be_bom() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\xFE\xFF\x00\x00", |
| mask: b"\xFF\xFF\x00\x00", |
| content_type: mime::TEXT_PLAIN, |
| leading_ignore: &[], |
| } |
| } |
| // UTF-16LE BOM |
| fn text_plain_utf_16le_bom() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\xFF\xFE\x00\x00", |
| mask: b"\xFF\xFF\x00\x00", |
| content_type: mime::TEXT_PLAIN, |
| leading_ignore: &[], |
| } |
| } |
| // UTF-8 BOM |
| fn text_plain_utf_8_bom() -> ByteMatcher { |
| ByteMatcher { |
| pattern: b"\xEF\xBB\xBF\x00", |
| mask: b"\xFF\xFF\xFF\x00", |
| content_type: mime::TEXT_PLAIN, |
| leading_ignore: &[], |
| } |
| } |
| } |