From d2213af48009aa27ce012095835d2a4fad230c4f Mon Sep 17 00:00:00 2001 From: Alexandre Pasmantier Date: Fri, 18 Oct 2024 00:48:37 +0200 Subject: [PATCH] a more sensible method to detect text files --- Cargo.lock | 12 +++++++++++- Cargo.toml | 4 ++-- build.rs | 12 +++++++----- crates/television/channels/files.rs | 17 ++++++++++++++--- crates/television/channels/text.rs | 9 ++++++--- crates/television/cli.rs | 6 ++++-- crates/television/previewers/files.rs | 13 ++++--------- crates/television/utils/strings.rs | 9 ++++++++- 8 files changed, 56 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 224ba6d..99b206b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2153,6 +2153,15 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.38.37" @@ -2419,7 +2428,7 @@ dependencies = [ [[package]] name = "television" -version = "0.1.5" +version = "0.1.6" dependencies = [ "anyhow", "better-panic", @@ -2832,6 +2841,7 @@ dependencies = [ "cargo_metadata", "derive_builder", "regex", + "rustc_version", "rustversion", "time", "vergen-lib", diff --git a/Cargo.toml b/Cargo.toml index 2a49d8a..f07a4dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "television" -version = "0.1.5" +version = "0.1.6" edition = "2021" description = "The revolution will be televised." license = "MIT" @@ -72,7 +72,7 @@ pretty_assertions = "1.4.1" [build-dependencies] anyhow = "1.0.86" -vergen-gix = { version = "1.0.0", features = ["build", "cargo"] } +vergen-gix = { version = "1.0.0", features = ["build", "cargo", "rustc"] } [profile.staging] diff --git a/build.rs b/build.rs index 2461afa..6c8d039 100644 --- a/build.rs +++ b/build.rs @@ -1,13 +1,15 @@ use anyhow::Result; -use vergen_gix::{BuildBuilder, CargoBuilder, Emitter, GixBuilder}; +use vergen_gix::{ + BuildBuilder, CargoBuilder, Emitter, GixBuilder, RustcBuilder, +}; fn main() -> Result<()> { - let build = BuildBuilder::all_build()?; - let gix = GixBuilder::all_git()?; - let cargo = CargoBuilder::all_cargo()?; + let build = BuildBuilder::default().build_date(true).build()?; + let cargo = CargoBuilder::default().target_triple(true).build()?; + let rustc = RustcBuilder::default().semver(true).build()?; Emitter::default() .add_instructions(&build)? - .add_instructions(&gix)? .add_instructions(&cargo)? + .add_instructions(&rustc)? .emit() } diff --git a/crates/television/channels/files.rs b/crates/television/channels/files.rs index 141a9d3..94d118a 100644 --- a/crates/television/channels/files.rs +++ b/crates/television/channels/files.rs @@ -3,15 +3,17 @@ use nucleo::{ pattern::{CaseMatching, Normalization}, Config, Injector, Nucleo, }; -use std::{path::PathBuf, sync::Arc}; +use std::{os::unix::ffi::OsStrExt, path::PathBuf, sync::Arc}; use ignore::DirEntry; use super::TelevisionChannel; -use crate::entry::Entry; -use crate::fuzzy::MATCHER; use crate::previewers::PreviewType; use crate::utils::files::{walk_builder, DEFAULT_NUM_THREADS}; +use crate::{ + entry::Entry, utils::strings::proportion_of_printable_ascii_characters, +}; +use crate::{fuzzy::MATCHER, utils::strings::PRINTABLE_ASCII_THRESHOLD}; pub(crate) struct Channel { matcher: Nucleo, @@ -19,6 +21,8 @@ pub(crate) struct Channel { result_count: u32, total_count: u32, running: bool, + // TODO: cache results (to make deleting characters smoother) but like + // a shallow cache (maybe more like a stack actually? so we just pop result sets) } impl Channel { @@ -131,6 +135,13 @@ async fn load_files(path: PathBuf, injector: Injector) { if let Ok(entry) = result { if entry.file_type().unwrap().is_file() { // Send the path via the async channel + let file_name = entry.file_name(); + if proportion_of_printable_ascii_characters( + file_name.as_bytes(), + ) < PRINTABLE_ASCII_THRESHOLD + { + return ignore::WalkState::Continue; + } let _ = injector.push(entry, |e, cols| { cols[0] = e .path() diff --git a/crates/television/channels/text.rs b/crates/television/channels/text.rs index 9178951..ae4d6dc 100644 --- a/crates/television/channels/text.rs +++ b/crates/television/channels/text.rs @@ -13,13 +13,15 @@ use std::{ use tracing::{debug, info}; use super::TelevisionChannel; -use crate::entry::Entry; -use crate::fuzzy::MATCHER; use crate::previewers::PreviewType; use crate::utils::{ files::{is_not_text, is_valid_utf8, walk_builder, DEFAULT_NUM_THREADS}, strings::preprocess_line, }; +use crate::{ + entry::Entry, utils::strings::proportion_of_printable_ascii_characters, +}; +use crate::{fuzzy::MATCHER, utils::strings::PRINTABLE_ASCII_THRESHOLD}; #[derive(Debug)] struct CandidateLine { @@ -184,7 +186,8 @@ async fn load_candidates(path: PathBuf, injector: Injector) { if (bytes_read == 0) || is_not_text(&buffer) .unwrap_or(false) - || !is_valid_utf8(&buffer) + || proportion_of_printable_ascii_characters(&buffer) + < PRINTABLE_ASCII_THRESHOLD { return ignore::WalkState::Continue; } diff --git a/crates/television/cli.rs b/crates/television/cli.rs index af4e728..3261f82 100644 --- a/crates/television/cli.rs +++ b/crates/television/cli.rs @@ -21,8 +21,10 @@ pub(crate) struct Cli { const VERSION_MESSAGE: &str = concat!( env!("CARGO_PKG_VERSION"), - "-", - env!("VERGEN_GIT_DESCRIBE"), + "\ntarget triple: ", + env!("VERGEN_CARGO_TARGET_TRIPLE"), + "\nbuild: ", + env!("VERGEN_RUSTC_SEMVER"), " (", env!("VERGEN_BUILD_DATE"), ")" diff --git a/crates/television/previewers/files.rs b/crates/television/previewers/files.rs index 500877b..51b5285 100644 --- a/crates/television/previewers/files.rs +++ b/crates/television/previewers/files.rs @@ -17,11 +17,11 @@ use tracing::{debug, warn}; use crate::entry; use crate::previewers::{Preview, PreviewContent}; -use crate::utils::files::is_valid_utf8; use crate::utils::files::FileType; use crate::utils::files::{get_file_size, is_known_text_extension}; use crate::utils::strings::{ preprocess_line, proportion_of_printable_ascii_characters, + PRINTABLE_ASCII_THRESHOLD, }; use super::cache::PreviewCache; @@ -105,7 +105,8 @@ impl FilePreviewer { FileType::Image => { debug!("Previewing image file: {:?}", entry.name); // insert a loading preview into the cache - let preview = loading(&entry.name); + //let preview = loading(&entry.name); + let preview = not_supported(&entry.name); self.cache_preview(entry.name.clone(), preview.clone()) .await; //// compute the image preview in the background @@ -199,9 +200,6 @@ impl FilePreviewer { /// 4 MB const MAX_FILE_SIZE: u64 = 4 * 1024 * 1024; - /// The proportion of printable ascii characters that a file must have to be considered text. - const PRINTABLE_ASCII_THRESHOLD: f32 = 0.9; - fn get_file_type(&self, path: &Path) -> FileType { debug!("Getting file type for {:?}", path); let mut file_type = match infer::get_from_path(path) { @@ -225,12 +223,9 @@ impl FilePreviewer { } else if let Ok(mut f) = File::open(path) { let mut buffer = [0u8; 256]; if let Ok(bytes_read) = f.read(&mut buffer) { - // TODO: add a check for the proportion of non printable characters (binary - // files) if bytes_read > 0 - && is_valid_utf8(&buffer) && proportion_of_printable_ascii_characters(&buffer) - > Self::PRINTABLE_ASCII_THRESHOLD + > PRINTABLE_ASCII_THRESHOLD { file_type = FileType::Text; } diff --git a/crates/television/utils/strings.rs b/crates/television/utils/strings.rs index db5ec42..26ade0c 100644 --- a/crates/television/utils/strings.rs +++ b/crates/television/utils/strings.rs @@ -54,6 +54,7 @@ lazy_static! { pub const EMPTY_STRING: &str = ""; pub const FOUR_SPACES: &str = " "; +pub const TAB_WIDTH: usize = 4; const SPACE_CHARACTER: char = ' '; const TAB_CHARACTER: char = '\t'; @@ -108,6 +109,12 @@ pub(crate) fn replace_nonprintable(input: &[u8], tab_width: usize) -> String { output } +/// The threshold for considering a buffer to be printable ASCII. +/// +/// This is used to determine whether a file is likely to be a text file +/// based on a sample of its contents. +pub const PRINTABLE_ASCII_THRESHOLD: f32 = 0.7; + pub(crate) fn proportion_of_printable_ascii_characters(buffer: &[u8]) -> f32 { let mut printable = 0; for &byte in buffer { @@ -131,7 +138,7 @@ pub(crate) fn preprocess_line(line: &str) -> String { } .trim_end_matches(['\r', '\n', '\0']) .as_bytes(), - 2, + TAB_WIDTH, ) }