#![allow(clippy::trivial_regex)] use regex::bytes::Regex; use regex::bytes::RegexSet; use std::fs; use std::fs::File; use std::io; use std::io::prelude::*; use std::path::Path; use std::os::unix::fs::FileTypeExt; use unicode_bom::Bom; // // File signatures links // - https://asecuritysite.com/forensics/magic // - https://filesignatures.net/ // - https://github.com/7h3rAm/cigma/blob/master/cigma/magicbytes.json #[derive(Debug, PartialEq, Eq)] pub enum LineEnding { Lf, Cr, Crlf, Mixed(usize, usize, usize), } #[derive(Debug, PartialEq, Eq)] pub enum Mimetype { Binary, Script(LineEnding), Pdf, Archive, Zip, Text(LineEnding), Data, Unknown, BlockDevice, CharDevice, Directory, Symlink, Fifo, Socket, Zerofile, VeryShort, Bom(Bom), } pub struct Filetype { buffer: Vec, } fn is_binary_data(vec: &[u8], len: usize) -> bool { for v in vec.iter().take(len) { if *v <= 8 { return true; } } false } fn _is_crlf(buffer: &[u8], len: usize) -> bool { let mut cr = 0; let mut lf = 0; const CR: u8 = 0x0d; // 13 const LF: u8 = 0x0a; // 10 for c in buffer.iter().take(len) { if *c == LF { lf += 1; } else if *c == CR { cr += 1; } } let diff: i32 = cr - lf; if cr > 0 && diff == 0 { return true; } //println!("cr: {}, lf: {}", cr, lf); // Heuristics: we accept if only a few lines are not Crlf match (cr, lf) { (0, _lf) => return false, (_cr, 0) => return true, (cr, _lf) => { if cr > 500 && diff.abs() < 3 { return true; } } } false } fn is_crlf(buffer: &[u8], len: usize) -> LineEnding { let mut seen_cr = false; let mut n_crlf = 0; let mut n_lf = 0; let mut n_cr = 0; const CR: u8 = 0x0d; // CR 0x0D 13 \r const LF: u8 = 0x0a; // LF 0x0A 10 \n for c in buffer.iter().take(len) { if *c == LF { if seen_cr { n_crlf += 1; } else { n_lf += 1; } } else if seen_cr { n_cr += 1; } seen_cr = *c == CR; } // println!("Lf / Cr / Crlf: {} / {} / {}", n_lf, n_cr, n_crlf); // println!("cr: {}, lf: {}, crlf: {}", n_cr, n_lf, n_crlf); // if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) // --> no line terminators match (n_cr, n_lf, n_crlf) { (0, 0, z) if z > 0 => LineEnding::Crlf, (x, 0, 0) if x > 0 => LineEnding::Cr, (0, y, 0) if y > 0 => LineEnding::Lf, (x, y, z) => LineEnding::Mixed(x, y, z), } } impl Filetype { pub fn new() -> Self { Filetype { buffer: vec![0; 1024 * 1024], } } pub fn analyze(&mut self, fname: &str) -> Result { // Result { let path = Path::new(fname); if let Some(ft) = get_filetype(path) { return Ok(ft); } let metadata = fs::symlink_metadata(fname)?; let file_length: usize = metadata.len() as usize; if file_length == 0 { return Ok(Mimetype::Zerofile); } if metadata.len() == 1 { return Ok(Mimetype::VeryShort); } let mut hdl_in = File::open(path)?; let mut bytes_read: usize = hdl_in.read(&mut self.buffer[0..262])?; // PostScript signatures // - %!PS-Adobe-1.0, %!PS-Adobe-2.0, %!PS-Adobe-3.0, %!PS-Adobe-3.1 // - %! and a line feed if bytes_read >= 4 && &self.buffer[0..4] == b"%!PS" { return Ok(Mimetype::Data); } // - %!\r\n%%BoundingBox: let re: Regex = Regex::new(r"^(?-u)%!(\x0d\x0a|\x0A)%%BoundingBox").unwrap(); if bytes_read >= 20 && re.is_match(&self.buffer) { return Ok(Mimetype::Data); } if bytes_read >= 4 && &self.buffer[0..4] == b"%PDF" { return Ok(Mimetype::Pdf); } // rtf document if bytes_read >= 6 && &self.buffer[0..6] == b"\x7B\x5C\x72\x74\x66\x31" { return Ok(Mimetype::Data); } // ZOO archive http://fileformats.archiveteam.org/wiki/ZOO if bytes_read >= 60 && &self.buffer[20..24] == b"\xDC\xA7\xC4\xFD" { return Ok(Mimetype::Archive); } let bom: Bom = Bom::from(&self.buffer[0..]); if bom != Bom::Null { return Ok(Mimetype::Bom(bom)); } if is_binary_data(&self.buffer, bytes_read) { match analyze_binary(&self.buffer) { Some(Mimetype::Zip) => { if fname.ends_with(".cdy") { return Ok(Mimetype::Data); } else { return Ok(Mimetype::Zip); } } Some(mt) => return Ok(mt), None => return Ok(Mimetype::Unknown), } } // https://en.wikipedia.org/wiki/BinHex if bytes_read >= 200 && self .buffer .starts_with(b"(This file must be converted with BinHex 4.0)") { return Ok(Mimetype::Binary); } if bytes_read < file_length { if let Ok(rb) = hdl_in.read(&mut self.buffer[262..]) { bytes_read += rb } } //println!("Filename: {}", fname); let crlf = is_crlf(&self.buffer, bytes_read); //println!("{:?}", crlf); // checks for // - shebang // - php indicator if bytes_read >= 5 && (self.buffer.starts_with(b"#!") || self.buffer.starts_with(b" Ok(Mimetype::Text(LineEnding::Lf)), // (LineEnding::Cr, false) => Ok(Mimetype::Text(LineEnding::Cr)), // (LineEnding::Crlf, false) => Ok(Mimetype::Text(LineEnding::Crlf)), // (LineEnding::Lf, true) => Ok(Mimetype::Script(LineEnding::Lf)), // (LineEnding::Cr, true) => Ok(Mimetype::Script(LineEnding::Cr)), // (LineEnding::Crlf, true) => Ok(Mimetype::Script(LineEnding::Crlf)), // (_, _) => Ok(Mimetype::Text(LineEnding::Lf)), // } } } // https://en.wikipedia.org/wiki/Executable_and_Linkable_Format fn is_binary(vec: &[u8]) -> Option { let binary_re: RegexSet = RegexSet::new([ r"^(?-u)\x7FELF[\x01\x02][\x01\x02]\x01[\x00-\x11]", // Executable and Linkable Format (ELF) r"^(?-u)\x00\x00\x03\xF3", // AmigaOS loadseg()ble executable/binary r"^(?-u)MZ", // DOS MZ executable file format and its descendants (including NE and PE) r"^(?-u)\x64 \x65\x78\x0A\x30\x33\x35\x00", // Dalvik's executable r"^(?-u)#[!]", // script executable r"^(?-u)(\xCE|\xCF)\xFA\xED\xFE", // Mach-O binary r"^(?-u)\x1B\x4C\x75\x61", // Lua bytecode ]) .unwrap(); if binary_re.is_match(vec) { return Some(Mimetype::Binary); } None } // https://github.com/7h3rAm/cigma/blob/master/cigma/magicbytes.json // https://en.wikipedia.org/wiki/List_of_file_signatures fn is_archive(vec: &[u8]) -> Option { // we first have to catch zip files with mimetype formats // - opendocument formats // - Word Open XML // Those we do not regard as archives let special_zip: RegexSet = RegexSet::new([ r"^(?-u)PK\x03\x04.{20,}\x08\x00\x00\x00mimetypeapplication", r"^(?-u)PK\x03\x04\x14\x00\x06\x00", // Word Open XML (.docx) r"^(?-u)PK\x03\x04\x14\x00\x08\x00", // Java Jar file r"^(?-u)PK\x03\x04\x14\x00\x08\x08", // Java Jar file r"^(?-u)PK\x03\x04\x0A.*?META-INF", // Java Jar file r"^(?-u)PK\x03\x04.*?META-INF", // Java Jar file r"^(?-u)PK\x03\x04\x0A.*?\x56\x92\x48\x4F\xEF", // Java Jar file ]) .unwrap(); if special_zip.is_match(vec) { return Some(Mimetype::Data); } let archive_re: RegexSet = RegexSet::new([ r"^(?-u)\x37\x7A\xBC\xAF\x27\x1C", // 7zip r"^(?-u)\x1f\x8B", // gzip (.gz) r"^(?-u)\x1f\x9D", // LZW (.tar.Z) r"^(?-u)\x1f\xA0", // LZH (.tar.Z) r"^(?-u)\xFD\x37\x7A\x58\x5A\x00\x00", // XZ comp. utility using LZMA2 compression (.xz) r"^(?-u)\x4D\x53\x43\x46", // Microsoft cabinet (.cab) r"^(?-u)\x42\x5A\x68", // bzip2 r"^(?-u)\x5A\x57\x53", // lzma r"^(?-u)\x5D\x00\x00(\x01|\x02|\x04|\x08|\x10|\x20|\x40|\x80)\x00", // lzma r"^(?-u)\x5D\x00\x00\x00\x01", // lzma r"^(?-u)(SIT!|SITD|STi0|StuffIt)", // SIT / stuffit (macintosh related) r"^(?-u)\x4D\x5A", // DOS MZ executable format, but found in zip archives r"^(?-u)\x52\x61\x72\x21\x1A\x07\x00", // RAR archive version 1.50 onwards r"^(?-u)\x52\x61\x72\x21\x1A\x07\x01\x00", // RAR archive version 5.0 onwards // https://en.wikipedia.org/wiki/LHA_(file_format) r"^(?-u)..-lh[0124567d]", // LHarc (canonical LZH) r"^(?-u)..-lh[89abce]", // LHarc (Joe Jared extensions) r"^(?-u)..-lhx", // LHarc (UNLHA32 extensions) r"^(?-u)..-(pc1|pm0|pm1|pm2|pms)", // LHarc (PMarc extensions) r"^(?-u)..-lz[s234578]", // LHarc (LArc extensions) r"^(?-u)\x53\x5a\x44\x44\x88\xf0\x27\x33", // RAR archive version 5.0 onwards ]) .unwrap(); if archive_re.is_match(vec) { return Some(Mimetype::Archive); } let archive_re: RegexSet = RegexSet::new([ r"^(?-u)PK(\x03\x04|\x4c\x49\x54\x45|\x30\x30\x50|\x05\x06|\x07\x08)", // zip archive ]) .unwrap(); if archive_re.is_match(vec) { return Some(Mimetype::Zip); } None } fn analyze_binary(vec: &[u8]) -> Option { let rc = is_binary(vec); if rc.is_some() { return rc; } let rc = is_archive(vec); if rc.is_some() { return rc; } Some(Mimetype::Data) } fn get_filetype(entry: &Path) -> Option { match entry.symlink_metadata() { Ok(mt) => { let ft = mt.file_type(); if ft.is_symlink() { return Some(Mimetype::Symlink); } if ft.is_dir() { return Some(Mimetype::Directory); } if ft.is_block_device() { return Some(Mimetype::BlockDevice); } if ft.is_char_device() { return Some(Mimetype::CharDevice); } if ft.is_fifo() { return Some(Mimetype::Fifo); } if ft.is_socket() { return Some(Mimetype::Socket); } None } Err(_e) => None, } } #[test] fn test_filetype() { let mut ft = Filetype::new(); assert!(ft.analyze("tests_filemagic/zerofile").ok() == Some(Mimetype::Zerofile)); assert!(ft.analyze("tests_filemagic/a_small_file").ok() == Some(Mimetype::VeryShort)); assert!(ft.analyze("/dev/null").ok() == Some(Mimetype::CharDevice)); assert!(ft.analyze("tests_filemagic/").ok() == Some(Mimetype::Directory)); assert!(ft.analyze("tests_filemagic/zerofile_symlink").ok() == Some(Mimetype::Symlink)); assert!(ft.analyze("tests_filemagic/some.pdf").ok() == Some(Mimetype::Pdf)); // This file is a pdf but has lines starting with % before the pdf signature shows up // The unix `file` command) says: data // analyze() says TextCrlf //assert!(ft.analyze("tests_filemagic/musterlogo.pdf").ok() == Some(Mimetype::Script)); assert!(ft.analyze("tests_filemagic/x.pl").ok() == Some(Mimetype::Script(LineEnding::Lf))); assert!(ft.analyze("tests_filemagic/main.php").ok() == Some(Mimetype::Script(LineEnding::Lf))); assert!(ft.analyze("tests_filemagic/test.7z").ok() == Some(Mimetype::Archive)); assert!(ft.analyze("tests_filemagic/x.tgz").ok() == Some(Mimetype::Archive)); assert!(ft.analyze("tests_filemagic/test.pdf.xz").ok() == Some(Mimetype::Archive)); assert!(ft.analyze("tests_filemagic/swebib.cab").ok() == Some(Mimetype::Archive)); assert!(ft.analyze("tests_filemagic/test.tar.bz2").ok() == Some(Mimetype::Archive)); assert!(ft.analyze("tests_filemagic/PIE.rar").ok() == Some(Mimetype::Archive)); assert!(ft.analyze("tests_filemagic/infozip-os390.tar.Z").ok() == Some(Mimetype::Archive)); assert!(ft.analyze("tests_filemagic/bla.lha").ok() == Some(Mimetype::Archive)); assert!(ft.analyze("tests_filemagic/dvi.zoo").ok() == Some(Mimetype::Archive)); assert!(ft.analyze("tests_filemagic/rsfs-oztex.sit").ok() == Some(Mimetype::Archive)); assert!(ft.analyze("tests_filemagic/empty.zip").ok() == Some(Mimetype::Zip)); assert!( ft.analyze("tests_filemagic/README").ok() == Some(Mimetype::Text(LineEnding::Mixed(0, 0, 0))) ); // assert!(ft.analyze("tests_filemagic/README1").ok() == Some(Mimetype::Text)); assert!(ft.analyze("tests_filemagic/cp").ok() == Some(Mimetype::Binary)); assert!(ft.analyze("tests_filemagic/cheq-f.sit-hqx").ok() == Some(Mimetype::Binary)); assert!(ft.analyze("tests_filemagic/MuchMore").ok() == Some(Mimetype::Binary)); assert!(ft.analyze("tests_filemagic/support.ps").ok() == Some(Mimetype::Data)); assert!(ft.analyze("tests_filemagic/rosette.eps").ok() == Some(Mimetype::Data)); assert!(ft.analyze("tests_filemagic/eutest.ps").ok() == Some(Mimetype::Data)); // assert!(ft.analyze("tests_filemagic/NORMAL.PS").ok() == Some(Mimetype::Data)); assert!(ft.analyze("tests_filemagic/chap5.rtf").ok() == Some(Mimetype::Data)); assert!(ft.analyze("tests_filemagic/commons-math.jar").ok() == Some(Mimetype::Data)); assert!( ft.analyze("tests_filemagic/8stbu11h.htm").ok() == Some(Mimetype::Text(LineEnding::Mixed(0, 1, 8710))) ); }