#![allow(clippy::trivial_regex)]

use regex::bytes::Regex;
use regex::bytes::RegexSet;

use std::fs;
use std::fs::File;
use std::io::prelude::*;
use std::path::Path;

use std::os::unix::fs::FileTypeExt;

use unicode_bom::Bom;

use anyhow::Result;

//
// File signatures links
// - https://asecuritysite.com/forensics/magic
// - https://filesignatures.net/
// - https://github.com/7h3rAm/cigma/blob/master/cigma/magicbytes.json

/// Line terminator style detected in a buffer by `is_crlf`.
#[derive(Debug, PartialEq, Eq)]
pub enum LineEnding {
    /// Only bare LF (`\n`, 0x0A) terminators were counted.
    Lf,
    /// Only bare CR (`\r`, 0x0D) terminators were counted.
    Cr,
    /// Only CRLF (`\r\n`) pairs were counted.
    Crlf,
    /// Any other combination. The fields are the number of bare CR, bare LF
    /// and CRLF terminators, in that order; `Mixed(0, 0, 0)` means that no
    /// line terminator was counted at all.
    Mixed(usize, usize, usize),
}

/// Classification of a file system entry as reported by `Filetype::analyze`.
#[derive(Debug, PartialEq, Eq)]
pub enum Mimetype {
    /// Content matching one of the executable signatures checked by
    /// `is_binary`, or a text file starting with the BinHex 4.0 banner.
    Binary,
    /// Text starting with `#! `, `#!/` or `<?php`, with its line ending style.
    Script(LineEnding),
    /// Content starting with `%PDF`.
    Pdf,
    /// Content matching one of the archive signatures checked by
    /// `is_archive`, or the ZOO signature.
    Archive,
    /// Content with a zip (`PK…`) signature that is not one of the special
    /// zip-based containers.
    Zip,
    /// Fallback for content without binary bytes, with its line ending style.
    Text(LineEnding),
    /// PostScript, RTF, zip-based containers, zip files named `*.cdy`, and
    /// binary data without any known signature.
    Data,
    /// Returned by `Filetype::analyze` when `analyze_binary` yields no
    /// classification for binary data.
    Unknown,
    /// The entry is a block device.
    BlockDevice,
    /// The entry is a character device.
    CharDevice,
    /// The entry is a directory.
    Directory,
    /// The entry is a symbolic link (the link itself is not followed).
    Symlink,
    /// The entry is a FIFO.
    Fifo,
    /// The entry is a socket.
    Socket,
    /// Regular file with a length of zero bytes.
    Zerofile,
    /// Regular file with a length of exactly one byte.
    VeryShort,
    /// Content starting with the given byte order mark.
    Bom(Bom),
}

/// File type analyzer; see `Filetype::analyze`.
pub struct Filetype {
    // Read buffer that `analyze` fills with the file content; it is allocated
    // once by `new` (1 MiB of zeros) and reused for every analyzed file.
    buffer: Vec<u8>,
}

/// Returns `true` if any of the first `len` bytes of `vec` is a control
/// byte in the range `0x00..=0x08`.
///
/// `len` may exceed `vec.len()`; in that case the whole slice is inspected.
fn is_binary_data(vec: &[u8], len: usize) -> bool {
    vec.iter().take(len).any(|&byte| byte <= 8)
}

/// Heuristic CRLF check based on plain CR and LF byte counts in the first
/// `len` bytes of `buffer`.
///
/// Returns `true` when
/// - there is at least one CR and the CR and LF counts are equal, or
/// - there are CRs but no LF at all, or
/// - there are more than 500 CRs and the two counts differ by less than 3
///   (a few lines that are not CRLF are tolerated).
///
/// Not called anywhere in this file (hence the leading underscore).
fn _is_crlf(buffer: &[u8], len: usize) -> bool {
    const CR: u8 = 0x0d; // 13
    const LF: u8 = 0x0a; // 10

    let (cr, lf) = buffer
        .iter()
        .take(len)
        .fold((0i32, 0i32), |(cr, lf), &byte| match byte {
            LF => (cr, lf + 1),
            CR => (cr + 1, lf),
            _ => (cr, lf),
        });

    if cr == 0 {
        return false;
    }
    if lf == 0 || cr == lf {
        return true;
    }

    // Heuristics: we accept if only a few lines are not Crlf
    cr > 500 && (cr - lf).abs() < 3
}

/// Determines the line ending style of the first `len` bytes of `buffer`.
///
/// Three kinds of terminators are counted: a CR directly followed by an LF
/// (CRLF), an LF not preceded by a CR (LF), and a CR not followed by an LF
/// (CR).
///
/// A CR that is the very last inspected byte is counted as a bare CR only if
/// `len < buffer.len()`, i.e. if the data ended before the buffer was full.
/// When the buffer is completely filled the data may have been cut off
/// between a CR and its LF, so such a trailing CR is not counted.
///
/// Returns `Crlf`, `Cr` or `Lf` if exactly one kind was found, otherwise
/// `Mixed(n_cr, n_lf, n_crlf)`; `Mixed(0, 0, 0)` means no terminators at all.
fn is_crlf(buffer: &[u8], len: usize) -> LineEnding {
    const CR: u8 = 0x0d; // CR 0x0D 13 \r
    const LF: u8 = 0x0a; // LF 0x0A 10 \n

    let mut seen_cr = false;
    let mut n_crlf = 0;
    let mut n_lf = 0;
    let mut n_cr = 0;

    for &c in buffer.iter().take(len) {
        if c == LF {
            if seen_cr {
                n_crlf += 1;
            } else {
                n_lf += 1;
            }
        } else if seen_cr {
            // The previous byte was a CR that is not part of a CRLF pair.
            n_cr += 1;
        }

        seen_cr = c == CR;
    }

    // The loop only classifies a CR once it has seen the byte after it, so a
    // CR at the end of the data has not been counted yet.
    if seen_cr && len < buffer.len() {
        n_cr += 1;
    }

    match (n_cr, n_lf, n_crlf) {
        (0, 0, z) if z > 0 => LineEnding::Crlf,
        (x, 0, 0) if x > 0 => LineEnding::Cr,
        (0, y, 0) if y > 0 => LineEnding::Lf,
        (x, y, z) => LineEnding::Mixed(x, y, z),
    }
}

impl Filetype {
    /// Number of leading bytes read up front for signature detection.
    const SNIFF_LEN: usize = 262;

    /// Creates an analyzer with a zero-initialised read buffer of 1 MiB.
    pub fn new() -> Self {
        Filetype {
            buffer: vec![0; 1024 * 1024],
        }
    }

    /// Classifies the file system entry `fname`.
    ///
    /// Symlinks, directories, devices, FIFOs and sockets are reported without
    /// opening them. For other entries the length is checked first
    /// (`Zerofile`, `VeryShort`), then the leading bytes are matched against
    /// known signatures. Content that matches no signature and contains no
    /// binary bytes is read into the buffer (at most the buffer size) and
    /// reported as `Script` or `Text` together with its line ending style.
    ///
    /// The buffer is reused between calls, therefore only the bytes actually
    /// read from the current file are passed to the signature checks.
    ///
    /// # Errors
    ///
    /// Fails if the metadata cannot be read, the file cannot be opened, or
    /// the initial read fails. Errors while reading the remainder of a text
    /// file are not reported; the bytes read up to that point are used.
    pub fn analyze(&mut self, fname: &str) -> Result<Mimetype> {
        let path = Path::new(fname);

        if let Some(ft) = get_filetype(path) {
            return Ok(ft);
        }

        let file_length = fs::symlink_metadata(fname)?.len();
        match file_length {
            0 => return Ok(Mimetype::Zerofile),
            1 => return Ok(Mimetype::VeryShort),
            _ => {}
        }

        let mut hdl_in = File::open(path)?;
        let mut bytes_read: usize = hdl_in.read(&mut self.buffer[..Self::SNIFF_LEN])?;

        if let Some(mimetype) = Self::sniff_header(&self.buffer[..bytes_read], fname) {
            return Ok(mimetype);
        }

        // Read the remainder directly behind what is already in the buffer.
        // This stays best effort: an error or end of file simply ends the loop.
        let capacity = self.buffer.len();
        while bytes_read < capacity && (bytes_read as u64) < file_length {
            match hdl_in.read(&mut self.buffer[bytes_read..]) {
                Ok(0) | Err(_) => break,
                Ok(n) => bytes_read += n,
            }
        }

        // The whole buffer is passed on purpose: `is_crlf` only inspects the
        // first `bytes_read` bytes and can tell from the buffer length whether
        // the data filled the buffer completely.
        let crlf = is_crlf(&self.buffer, bytes_read);

        // checks for
        // - shebang which either starts with `#! ` or `#!/`
        // - php indicator
        let text = &self.buffer[..bytes_read];
        let is_script = bytes_read >= 5
            && (text.starts_with(b"#! ")
                || text.starts_with(b"#!/")
                || text.starts_with(b"<?php"));

        Ok(if is_script {
            Mimetype::Script(crlf)
        } else {
            Mimetype::Text(crlf)
        })
    }

    /// Matches the leading bytes of a file against the known signatures.
    ///
    /// `header` must contain only bytes of the current file. Returns `None`
    /// if no signature matches and the header contains no binary bytes.
    fn sniff_header(header: &[u8], fname: &str) -> Option<Mimetype> {
        // The pattern is constant, so it is compiled only once.
        static EPS_HEADER: std::sync::OnceLock<Regex> = std::sync::OnceLock::new();

        // PostScript signatures
        // - %!PS-Adobe-1.0, %!PS-Adobe-2.0, %!PS-Adobe-3.0, %!PS-Adobe-3.1
        if header.starts_with(b"%!PS") {
            return Some(Mimetype::Data);
        }

        // - %! and a line feed: %!\r\n%%BoundingBox:
        let eps_header = EPS_HEADER.get_or_init(|| {
            Regex::new(r"^(?-u)%!(\x0d\x0a|\x0A)%%BoundingBox")
                .expect("hard-coded PostScript pattern is a valid regex")
        });
        if header.len() >= 20 && eps_header.is_match(header) {
            return Some(Mimetype::Data);
        }

        if header.starts_with(b"%PDF") {
            return Some(Mimetype::Pdf);
        }

        // rtf document: `{\rtf1`
        if header.starts_with(b"\x7B\x5C\x72\x74\x66\x31") {
            return Some(Mimetype::Data);
        }

        // ZOO archive  http://fileformats.archiveteam.org/wiki/ZOO
        if header.len() >= 60 && &header[20..24] == b"\xDC\xA7\xC4\xFD" {
            return Some(Mimetype::Archive);
        }

        let bom: Bom = Bom::from(header);
        if bom != Bom::Null {
            return Some(Mimetype::Bom(bom));
        }

        if is_binary_data(header, header.len()) {
            return Some(match analyze_binary(header) {
                // Zip files with the extension `.cdy` are reported as data.
                Some(Mimetype::Zip) if fname.ends_with(".cdy") => Mimetype::Data,
                Some(mt) => mt,
                None => Mimetype::Unknown,
            });
        }

        // https://en.wikipedia.org/wiki/BinHex
        if header.len() >= 200
            && header.starts_with(b"(This file must be converted with BinHex 4.0)")
        {
            return Some(Mimetype::Binary);
        }

        None
    }
}

impl Default for Filetype {
    /// Equivalent to [`Filetype::new`].
    fn default() -> Self {
        Self::new()
    }
}

// https://en.wikipedia.org/wiki/Executable_and_Linkable_Format
// https://en.wikipedia.org/wiki/Mach-O
/// Returns `Some(Mimetype::Binary)` if `vec` starts with one of the known
/// executable signatures, otherwise `None`.
fn is_binary(vec: &[u8]) -> Option<Mimetype> {
    // The patterns are constant, so the set is compiled only once.
    static BINARY_RE: std::sync::OnceLock<RegexSet> = std::sync::OnceLock::new();

    let binary_re = BINARY_RE.get_or_init(|| {
        RegexSet::new([
            r"^(?-u)\x7FELF[\x01\x02][\x01\x02]\x01[\x00-\x11]", // Executable and Linkable Format (ELF)
            r"^(?-u)\x00\x00\x03\xF3", // AmigaOS loadseg()ble executable/binary
            r"^(?-u)MZ", // DOS MZ executable file format and its descendants (including NE and PE)
            // Dalvik's executable: "dex\n", a three digit version (e.g. 035)
            // and a NUL byte. There must be no whitespace inside the pattern:
            // a blank is matched literally.
            r"^(?-u)\x64\x65\x78\x0A[0-9]{3}\x00",
            r"^(?-u)#[!]", // script executable
            r"^(?-u)\xCA\xFE\xBA\xBE", // Mach-O binary universal header
            // \xCE\xFA\xED\xFE or \xCF\xFA\xED\xFE
            r"^(?-u)(\xCE|\xCF)\xFA\xED\xFE", // Mach-O binary
            r"^(?-u)\x1B\x4C\x75\x61",        // Lua bytecode
        ])
        .expect("hard-coded executable signatures are valid regexes")
    });

    binary_re.is_match(vec).then_some(Mimetype::Binary)
}

// https://github.com/7h3rAm/cigma/blob/master/cigma/magicbytes.json
// https://en.wikipedia.org/wiki/List_of_file_signatures
/// Classifies `vec` by archive signatures at its start.
///
/// Returns
/// - `Some(Mimetype::Data)` for zip-based containers which are not regarded
///   as archives (OpenDocument formats, Word Open XML, Java jar files),
/// - `Some(Mimetype::Archive)` for the known non-zip archive signatures,
/// - `Some(Mimetype::Zip)` for any other zip signature,
/// - `None` if nothing matches.
fn is_archive(vec: &[u8]) -> Option<Mimetype> {
    // The patterns are constant, so each set is compiled only once.
    static SPECIAL_ZIP_RE: std::sync::OnceLock<RegexSet> = std::sync::OnceLock::new();
    static ARCHIVE_RE: std::sync::OnceLock<RegexSet> = std::sync::OnceLock::new();
    static ZIP_RE: std::sync::OnceLock<RegexSet> = std::sync::OnceLock::new();

    // we first have to catch zip files with mimetype formats
    //  - opendocument formats
    //  - Word Open XML
    // Those we do not regard as archives
    //
    // NOTE(review): without the `s` flag `.` does not match the byte 0x0A, so
    // the `.{20,}` / `.*?` patterns below fail when a header byte in between
    // happens to be 0x0A. They are left unchanged because these wildcards are
    // unbounded and would then scan through the complete haystack.
    let special_zip = SPECIAL_ZIP_RE.get_or_init(|| {
        RegexSet::new([
            r"^(?-u)PK\x03\x04.{20,}\x08\x00\x00\x00mimetypeapplication",
            r"^(?-u)PK\x03\x04\x14\x00\x06\x00", // Word Open XML (.docx)
            r"^(?-u)PK\x03\x04\x14\x00\x08\x00", // Java Jar file
            r"^(?-u)PK\x03\x04\x14\x00\x08\x08", // Java Jar file
            r"^(?-u)PK\x03\x04\x0A.*?META-INF",  // Java Jar file
            r"^(?-u)PK\x03\x04.*?META-INF",      // Java Jar file
            r"^(?-u)PK\x03\x04\x0A.*?\x56\x92\x48\x4F\xEF", // Java Jar file
        ])
        .expect("hard-coded zip container signatures are valid regexes")
    });

    if special_zip.is_match(vec) {
        return Some(Mimetype::Data);
    }

    let archive_re = ARCHIVE_RE.get_or_init(|| {
        RegexSet::new([
            r"^(?-u)\x37\x7A\xBC\xAF\x27\x1C",     // 7zip
            r"^(?-u)\x1f\x8B",                     // gzip (.gz)
            r"^(?-u)\x1f\x9D",                     // LZW (.tar.Z)
            r"^(?-u)\x1f\xA0",                     // LZH (.tar.Z)
            r"^(?-u)\xFD\x37\x7A\x58\x5A\x00\x00", // XZ comp. utility using LZMA2 compression (.xz)
            r"^(?-u)\x4D\x53\x43\x46",             // Microsoft cabinet (.cab)
            r"^(?-u)\x42\x5A\x68",                 // bzip2
            r"^(?-u)\x5A\x57\x53",                 // "ZWS": lzma compressed Flash file
            r"^(?-u)\x5D\x00\x00(\x01|\x02|\x04|\x08|\x10|\x20|\x40|\x80)\x00", // lzma
            r"^(?-u)\x5D\x00\x00\x00\x01",         // lzma
            r"^(?-u)(SIT!|SITD|STi0|StuffIt)",     // SIT / stuffit (macintosh related)
            r"^(?-u)\x4D\x5A", // DOS MZ executable format, but found in zip archives
            r"^(?-u)\x52\x61\x72\x21\x1A\x07\x00", // RAR archive version 1.50 onwards
            r"^(?-u)\x52\x61\x72\x21\x1A\x07\x01\x00", // RAR archive version 5.0 onwards
            // https://en.wikipedia.org/wiki/LHA_(file_format)
            // The method id is preceded by two arbitrary header bytes. The `s`
            // flag lets `.` match any byte, including 0x0A.
            r"^(?s-u)..-lh[0124567d]",          // LHarc (canonical LZH)
            r"^(?s-u)..-lh[89abce]",            // LHarc (Joe Jared extensions)
            r"^(?s-u)..-lhx",                   // LHarc (UNLHA32 extensions)
            r"^(?s-u)..-(pc1|pm0|pm1|pm2|pms)", // LHarc (PMarc extensions)
            r"^(?s-u)..-lz[s234578]",           // LHarc (LArc extensions)
            // "SZDD" + \x88\xf0\x27\x33: Microsoft compressed file (SZDD),
            // as listed on Wikipedia's list of file signatures
            r"^(?-u)\x53\x5a\x44\x44\x88\xf0\x27\x33",
        ])
        .expect("hard-coded archive signatures are valid regexes")
    });

    if archive_re.is_match(vec) {
        return Some(Mimetype::Archive);
    }

    let zip_re = ZIP_RE.get_or_init(|| {
        RegexSet::new([
            r"^(?-u)PK(\x03\x04|\x4c\x49\x54\x45|\x30\x30\x50|\x05\x06|\x07\x08)", // zip archive
        ])
        .expect("hard-coded zip signature is a valid regex")
    });

    if zip_re.is_match(vec) {
        return Some(Mimetype::Zip);
    }

    None
}

/// Classifies binary content: executable signatures are tried first, then
/// archive signatures; anything else is reported as `Mimetype::Data`.
///
/// The result is always `Some`.
fn analyze_binary(vec: &[u8]) -> Option<Mimetype> {
    is_binary(vec)
        .or_else(|| is_archive(vec))
        .or(Some(Mimetype::Data))
}

/// Reports entries that are not regular files, without following symlinks.
///
/// Returns the matching `Mimetype` for a symlink, directory, block device,
/// character device, FIFO or socket (checked in this order). Returns `None`
/// for everything else and also when the metadata cannot be read.
fn get_filetype(entry: &Path) -> Option<Mimetype> {
    let kind = entry.symlink_metadata().ok()?.file_type();

    let special = if kind.is_symlink() {
        Mimetype::Symlink
    } else if kind.is_dir() {
        Mimetype::Directory
    } else if kind.is_block_device() {
        Mimetype::BlockDevice
    } else if kind.is_char_device() {
        Mimetype::CharDevice
    } else if kind.is_fifo() {
        Mimetype::Fifo
    } else if kind.is_socket() {
        Mimetype::Socket
    } else {
        return None;
    };

    Some(special)
}

// Runs `Filetype::analyze` on the fixtures below `tests_filemagic/` and
// compares the result with the expected classification. A single analyzer is
// used for all fixtures, in the order listed.
#[test]
fn test_filetype() {
    let mut ft = Filetype::new();

    let cases = [
        ("tests_filemagic/zerofile", Mimetype::Zerofile),
        ("tests_filemagic/a_small_file", Mimetype::VeryShort),
        ("/dev/null", Mimetype::CharDevice),
        ("tests_filemagic/", Mimetype::Directory),
        ("tests_filemagic/zerofile_symlink", Mimetype::Symlink),
        ("tests_filemagic/some.pdf", Mimetype::Pdf),
        // Not asserted: tests_filemagic/musterlogo.pdf
        // This file is a pdf but has lines starting with % before the pdf
        // signature shows up. The unix `file` command says: data,
        // analyze() says text with Crlf line endings.
        ("tests_filemagic/x.pl", Mimetype::Script(LineEnding::Lf)),
        ("tests_filemagic/main.php", Mimetype::Script(LineEnding::Lf)),
        ("tests_filemagic/test.7z", Mimetype::Archive),
        ("tests_filemagic/x.tgz", Mimetype::Archive),
        ("tests_filemagic/test.pdf.xz", Mimetype::Archive),
        ("tests_filemagic/swebib.cab", Mimetype::Archive),
        ("tests_filemagic/test.tar.bz2", Mimetype::Archive),
        ("tests_filemagic/PIE.rar", Mimetype::Archive),
        ("tests_filemagic/infozip-os390.tar.Z", Mimetype::Archive),
        ("tests_filemagic/bla.lha", Mimetype::Archive),
        ("tests_filemagic/dvi.zoo", Mimetype::Archive),
        ("tests_filemagic/rsfs-oztex.sit", Mimetype::Archive),
        ("tests_filemagic/empty.zip", Mimetype::Zip),
        (
            "tests_filemagic/README",
            Mimetype::Text(LineEnding::Mixed(0, 0, 0)),
        ),
        // Not asserted: tests_filemagic/README1 (text)
        ("tests_filemagic/cp", Mimetype::Binary),
        ("tests_filemagic/cheq-f.sit-hqx", Mimetype::Binary),
        ("tests_filemagic/MuchMore", Mimetype::Binary),
        ("tests_filemagic/support.ps", Mimetype::Data),
        ("tests_filemagic/rosette.eps", Mimetype::Data),
        ("tests_filemagic/eutest.ps", Mimetype::Data),
        // Not asserted: tests_filemagic/NORMAL.PS (data)
        ("tests_filemagic/chap5.rtf", Mimetype::Data),
        ("tests_filemagic/commons-math.jar", Mimetype::Data),
        (
            "tests_filemagic/8stbu11h.htm",
            Mimetype::Text(LineEnding::Mixed(0, 1, 8710)),
        ),
    ];

    for (fname, expected) in cases {
        // `assert_eq!` shows the actual classification when a case fails.
        assert_eq!(
            ft.analyze(fname).ok(),
            Some(expected),
            "unexpected classification for {}",
            fname
        );
    }
}
