Skip to content

Commit

Permalink
a more sensible method to detect text files
Browse files Browse the repository at this point in the history
  • Loading branch information
alexpasmantier committed Oct 17, 2024
1 parent 49a3948 commit d2213af
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 26 deletions.
12 changes: 11 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "television"
version = "0.1.5"
version = "0.1.6"
edition = "2021"
description = "The revolution will be televised."
license = "MIT"
Expand Down Expand Up @@ -72,7 +72,7 @@ pretty_assertions = "1.4.1"

[build-dependencies]
anyhow = "1.0.86"
vergen-gix = { version = "1.0.0", features = ["build", "cargo"] }
vergen-gix = { version = "1.0.0", features = ["build", "cargo", "rustc"] }


[profile.staging]
Expand Down
12 changes: 7 additions & 5 deletions build.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
use anyhow::Result;
use vergen_gix::{BuildBuilder, CargoBuilder, Emitter, GixBuilder};
use vergen_gix::{
BuildBuilder, CargoBuilder, Emitter, GixBuilder, RustcBuilder,
};

fn main() -> Result<()> {
let build = BuildBuilder::all_build()?;
let gix = GixBuilder::all_git()?;
let cargo = CargoBuilder::all_cargo()?;
let build = BuildBuilder::default().build_date(true).build()?;
let cargo = CargoBuilder::default().target_triple(true).build()?;
let rustc = RustcBuilder::default().semver(true).build()?;
Emitter::default()
.add_instructions(&build)?
.add_instructions(&gix)?
.add_instructions(&cargo)?
.add_instructions(&rustc)?
.emit()
}
17 changes: 14 additions & 3 deletions crates/television/channels/files.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,26 @@ use nucleo::{
pattern::{CaseMatching, Normalization},
Config, Injector, Nucleo,
};
use std::{path::PathBuf, sync::Arc};
use std::{os::unix::ffi::OsStrExt, path::PathBuf, sync::Arc};

use ignore::DirEntry;

use super::TelevisionChannel;
use crate::entry::Entry;
use crate::fuzzy::MATCHER;
use crate::previewers::PreviewType;
use crate::utils::files::{walk_builder, DEFAULT_NUM_THREADS};
use crate::{
entry::Entry, utils::strings::proportion_of_printable_ascii_characters,
};
use crate::{fuzzy::MATCHER, utils::strings::PRINTABLE_ASCII_THRESHOLD};

/// State held by the files channel.
///
/// NOTE(review): only the field declarations are visible here; the
/// per-field descriptions below are inferred from names and types and
/// should be confirmed against the `impl` blocks.
pub(crate) struct Channel {
/// `nucleo` matcher instance whose items are `ignore::DirEntry` values.
matcher: Nucleo<DirEntry>,
/// Presumably the most recently applied search pattern — confirm.
last_pattern: String,
/// Presumably the number of entries matching the current pattern — confirm.
result_count: u32,
/// Presumably the total number of entries known to the matcher — confirm.
total_count: u32,
/// Presumably whether the matcher is still processing — confirm.
running: bool,
// TODO: cache result sets so that deleting characters from the pattern is
// smoother. A shallow cache should be enough, possibly a stack from which
// earlier result sets are simply popped.
}

impl Channel {
Expand Down Expand Up @@ -131,6 +135,13 @@ async fn load_files(path: PathBuf, injector: Injector<DirEntry>) {
if let Ok(entry) = result {
if entry.file_type().unwrap().is_file() {
// Send the path via the async channel
let file_name = entry.file_name();
if proportion_of_printable_ascii_characters(
file_name.as_bytes(),
) < PRINTABLE_ASCII_THRESHOLD
{
return ignore::WalkState::Continue;
}
let _ = injector.push(entry, |e, cols| {
cols[0] = e
.path()
Expand Down
9 changes: 6 additions & 3 deletions crates/television/channels/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ use std::{
use tracing::{debug, info};

use super::TelevisionChannel;
use crate::entry::Entry;
use crate::fuzzy::MATCHER;
use crate::previewers::PreviewType;
use crate::utils::{
files::{is_not_text, is_valid_utf8, walk_builder, DEFAULT_NUM_THREADS},
strings::preprocess_line,
};
use crate::{
entry::Entry, utils::strings::proportion_of_printable_ascii_characters,
};
use crate::{fuzzy::MATCHER, utils::strings::PRINTABLE_ASCII_THRESHOLD};

#[derive(Debug)]
struct CandidateLine {
Expand Down Expand Up @@ -184,7 +186,8 @@ async fn load_candidates(path: PathBuf, injector: Injector<CandidateLine>) {
if (bytes_read == 0)
|| is_not_text(&buffer)
.unwrap_or(false)
|| !is_valid_utf8(&buffer)
|| proportion_of_printable_ascii_characters(&buffer)
< PRINTABLE_ASCII_THRESHOLD
{
return ignore::WalkState::Continue;
}
Expand Down
6 changes: 4 additions & 2 deletions crates/television/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ pub(crate) struct Cli {

const VERSION_MESSAGE: &str = concat!(
env!("CARGO_PKG_VERSION"),
"-",
env!("VERGEN_GIT_DESCRIBE"),
"\ntarget triple: ",
env!("VERGEN_CARGO_TARGET_TRIPLE"),
"\nbuild: ",
env!("VERGEN_RUSTC_SEMVER"),
" (",
env!("VERGEN_BUILD_DATE"),
")"
Expand Down
13 changes: 4 additions & 9 deletions crates/television/previewers/files.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ use tracing::{debug, warn};

use crate::entry;
use crate::previewers::{Preview, PreviewContent};
use crate::utils::files::is_valid_utf8;
use crate::utils::files::FileType;
use crate::utils::files::{get_file_size, is_known_text_extension};
use crate::utils::strings::{
preprocess_line, proportion_of_printable_ascii_characters,
PRINTABLE_ASCII_THRESHOLD,
};

use super::cache::PreviewCache;
Expand Down Expand Up @@ -105,7 +105,8 @@ impl FilePreviewer {
FileType::Image => {
debug!("Previewing image file: {:?}", entry.name);
// insert a loading preview into the cache
let preview = loading(&entry.name);
//let preview = loading(&entry.name);
let preview = not_supported(&entry.name);
self.cache_preview(entry.name.clone(), preview.clone())
.await;
//// compute the image preview in the background
Expand Down Expand Up @@ -199,9 +200,6 @@ impl FilePreviewer {
/// 4 MB
const MAX_FILE_SIZE: u64 = 4 * 1024 * 1024;

/// The proportion of printable ascii characters that a file must have to be considered text.
const PRINTABLE_ASCII_THRESHOLD: f32 = 0.9;

fn get_file_type(&self, path: &Path) -> FileType {
debug!("Getting file type for {:?}", path);
let mut file_type = match infer::get_from_path(path) {
Expand All @@ -225,12 +223,9 @@ impl FilePreviewer {
} else if let Ok(mut f) = File::open(path) {
let mut buffer = [0u8; 256];
if let Ok(bytes_read) = f.read(&mut buffer) {
// TODO: add a check for the proportion of non printable characters (binary
// files)
if bytes_read > 0
&& is_valid_utf8(&buffer)
&& proportion_of_printable_ascii_characters(&buffer)
> Self::PRINTABLE_ASCII_THRESHOLD
> PRINTABLE_ASCII_THRESHOLD
{
file_type = FileType::Text;
}
Expand Down
9 changes: 8 additions & 1 deletion crates/television/utils/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ lazy_static! {

/// The empty string.
pub const EMPTY_STRING: &str = "";
/// Exactly four consecutive space characters.
pub const FOUR_SPACES: &str = "    ";
/// Tab width, in characters, used as the `tab_width` argument when
/// preprocessing lines.
pub const TAB_WIDTH: usize = 4;

/// A single space character.
const SPACE_CHARACTER: char = ' ';
/// The horizontal tab character.
const TAB_CHARACTER: char = '\t';
Expand Down Expand Up @@ -108,6 +109,12 @@ pub(crate) fn replace_nonprintable(input: &[u8], tab_width: usize) -> String {
output
}

/// Proportion of printable ASCII bytes used as the cut-off when deciding
/// whether a buffer looks like text.
///
/// This is used to determine whether a file is likely to be a text file
/// based on a sample of its contents: the value is meant to be compared
/// against the result of `proportion_of_printable_ascii_characters`.
pub const PRINTABLE_ASCII_THRESHOLD: f32 = 0.7;

pub(crate) fn proportion_of_printable_ascii_characters(buffer: &[u8]) -> f32 {
let mut printable = 0;
for &byte in buffer {
Expand All @@ -131,7 +138,7 @@ pub(crate) fn preprocess_line(line: &str) -> String {
}
.trim_end_matches(['\r', '\n', '\0'])
.as_bytes(),
2,
TAB_WIDTH,
)
}

Expand Down

0 comments on commit d2213af

Please sign in to comment.