
Commit 9e5a27d: Gf2
cmdcolin committed Dec 12, 2024
1 parent 40a4b14 · commit 9e5a27d
Showing 8 changed files with 88 additions and 178 deletions.
9 changes: 3 additions & 6 deletions package.json
@@ -42,8 +42,8 @@
   ],
   "dependencies": {
     "@gmod/abortable-promise-cache": "^2.0.0",
-    "@gmod/bgzf-filehandle": "^1.3.3",
-    "generic-filehandle": "^3.0.0",
+    "@gmod/bgzf-filehandle": "^2.0.0",
+    "generic-filehandle2": "^0.0.1",
     "long": "^4.0.0",
     "quick-lru": "^4.0.0"
   },
@@ -55,16 +55,13 @@
"@typescript-eslint/eslint-plugin": "^8.0.1",
"@typescript-eslint/parser": "^8.0.1",
"@vitest/coverage-v8": "^2.0.5",
"buffer": "^6.0.3",
"documentation": "^14.0.3",
"eslint": "^9.9.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prettier": "^5.0.1",
"eslint-plugin-unicorn": "^56.0.0",
"prettier": "^3.3.3",
"rimraf": "^6.0.1",
"standard-changelog": "^6.0.0",
"typescript": "~5.6.0",
"typescript": "^5.7.0",
"typescript-eslint": "^8.0.1",
"vitest": "^2.0.5",
"webpack": "^5.93.0",
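Note on the swap above: generic-filehandle2's file handles hand back plain Uint8Arrays instead of filling caller-allocated node Buffers, which is what lets the buffer polyfill drop out of devDependencies and drives every other change in this diff. A minimal sketch of the consuming pattern, assuming the read(position, length) call shape used at this diff's call sites and a Promise<Uint8Array> return; the readMagic helper is hypothetical, not part of the package:

    import { LocalFile } from 'generic-filehandle2'

    // Hypothetical helper: read the first 4 bytes of a file and interpret them
    // as a little-endian uint32, the way the CSI magic check below does.
    // Assumes read(position, length) resolves directly to a Uint8Array, as in
    // the readChunk and header-fetch call sites in this diff.
    async function readMagic(path: string): Promise<number> {
      const file = new LocalFile(path)
      const bytes = await file.read(0, 4) // no Buffer.alloc/bytesRead bookkeeping
      const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
      return view.getUint32(0, true) // true = little-endian
    }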
63 changes: 35 additions & 28 deletions src/csi.ts
@@ -49,26 +49,27 @@ export default class CSI extends IndexFile
     throw new Error('CSI indexes do not support indexcov')
   }

-  parseAuxData(bytes: Buffer, offset: number) {
-    const formatFlags = bytes.readInt32LE(offset)
+  parseAuxData(bytes: Uint8Array, offset: number) {
+    const dataView = new DataView(bytes.buffer)
+    const formatFlags = dataView.getInt32(offset, true)
     const coordinateType =
       formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
     const format = { 0: 'generic', 1: 'SAM', 2: 'VCF' }[formatFlags & 0xf]
     if (!format) {
       throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
     }
     const columnNumbers = {
-      ref: bytes.readInt32LE(offset + 4),
-      start: bytes.readInt32LE(offset + 8),
-      end: bytes.readInt32LE(offset + 12),
+      ref: dataView.getInt32(offset + 4, true),
+      start: dataView.getInt32(offset + 8, true),
+      end: dataView.getInt32(offset + 12, true),
     }
-    const metaValue = bytes.readInt32LE(offset + 16)
+    const metaValue = dataView.getInt32(offset + 16, true)
     const metaChar = metaValue ? String.fromCharCode(metaValue) : null
-    const skipLines = bytes.readInt32LE(offset + 20)
-    const nameSectionLength = bytes.readInt32LE(offset + 24)
+    const skipLines = dataView.getInt32(offset + 20, true)
+    const nameSectionLength = dataView.getInt32(offset + 24, true)

     const { refIdToName, refNameToId } = this._parseNameBytes(
-      bytes.slice(offset + 28, offset + 28 + nameSectionLength),
+      bytes.subarray(offset + 28, offset + 28 + nameSectionLength),
     )

     return {
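The translation rule used throughout this file: Buffer's bytes.readInt32LE(offset) becomes dataView.getInt32(offset, true), where the trailing true selects little-endian. One caveat, offered as an observation rather than part of the commit: new DataView(bytes.buffer) views the underlying ArrayBuffer from position 0, so it is only equivalent when bytes.byteOffset is 0. A sketch of the fully general form:

    // Little-endian reads over a Uint8Array without node Buffer. Passing
    // byteOffset/byteLength keeps the view correct even when u8 is a
    // subarray into a larger ArrayBuffer (a safety step beyond this diff).
    function readInt32LE(u8: Uint8Array, offset: number): number {
      const view = new DataView(u8.buffer, u8.byteOffset, u8.byteLength)
      return view.getInt32(offset, true) // true = little-endian
    }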
@@ -82,15 +83,16 @@
     }
   }

-  _parseNameBytes(namesBytes: Buffer) {
+  _parseNameBytes(namesBytes: Uint8Array) {
     let currRefId = 0
     let currNameStart = 0
     const refIdToName = []
     const refNameToId: Record<string, number> = {}
+    const decoder = new TextDecoder('utf8')
     for (let i = 0; i < namesBytes.length; i += 1) {
       if (!namesBytes[i]) {
         if (currNameStart < i) {
-          let refName = namesBytes.toString('utf8', currNameStart, i)
+          let refName = decoder.decode(namesBytes.subarray(currNameStart, i))
           refName = this.renameRefSeq(refName)
           refIdToName[currRefId] = refName
           refNameToId[refName] = currRefId
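The name section is a run of NUL-terminated strings, so the loop scans for zero bytes and decodes each span with a TextDecoder in place of Buffer.toString('utf8', start, end). The same technique as a standalone sketch, simplified from _parseNameBytes above (no renameRefSeq hook, returns an array):

    // Decode a block of NUL-terminated names: the bytes of 'chr1\0chr2\0'
    // come back as ['chr1', 'chr2'].
    function parseNames(namesBytes: Uint8Array): string[] {
      const decoder = new TextDecoder('utf8')
      const names: string[] = []
      let start = 0
      for (let i = 0; i < namesBytes.length; i += 1) {
        if (namesBytes[i] === 0) {
          if (start < i) {
            names.push(decoder.decode(namesBytes.subarray(start, i)))
          }
          start = i + 1
        }
      }
      return names
    }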
@@ -99,30 +101,34 @@
         currRefId += 1
       }
     }
-    return { refNameToId, refIdToName }
+    return {
+      refNameToId,
+      refIdToName,
+    }
   }

   // fetch and parse the index

   async _parse(opts: Options = {}) {
     const bytes = await unzip(await this.filehandle.readFile(opts))
+    const dataView = new DataView(bytes.buffer)

     // check TBI magic numbers
     let csiVersion
-    if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
+    if (dataView.getUint32(0, true) === CSI1_MAGIC) {
       csiVersion = 1
-    } else if (bytes.readUInt32LE(0) === CSI2_MAGIC) {
+    } else if (dataView.getUint32(0, true) === CSI2_MAGIC) {
       csiVersion = 2
     } else {
       throw new Error('Not a CSI file')
       // TODO: do we need to support big-endian CSI files?
     }

-    this.minShift = bytes.readInt32LE(4)
-    this.depth = bytes.readInt32LE(8)
+    this.minShift = dataView.getInt32(4, true)
+    this.depth = dataView.getInt32(8, true)
     this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
     const maxRefLength = 2 ** (this.minShift + this.depth * 3)
-    const auxLength = bytes.readInt32LE(12)
+    const auxLength = dataView.getInt32(12, true)
     const aux =
       auxLength && auxLength >= 30
         ? this.parseAuxData(bytes, 16)
@@ -134,19 +140,19 @@
             coordinateType: 'zero-based-half-open',
             format: 'generic',
           }
-    const refCount = bytes.readInt32LE(16 + auxLength)
+    const refCount = dataView.getInt32(16 + auxLength, true)

     // read the indexes for each reference sequence
     let firstDataLine: VirtualOffset | undefined
     let currOffset = 16 + auxLength + 4
     const indices = new Array(refCount).fill(0).map(() => {
       // the binning index
-      const binCount = bytes.readInt32LE(currOffset)
+      const binCount = dataView.getInt32(currOffset, true)
       currOffset += 4
       const binIndex: Record<string, Chunk[]> = {}
       let stats // < provided by parsing a pseudo-bin, if present
       for (let j = 0; j < binCount; j += 1) {
-        const bin = bytes.readUInt32LE(currOffset)
+        const bin = dataView.getUint32(currOffset, true)
         if (bin > this.maxBinNumber) {
           // this is a fake bin that actually has stats information
           // about the reference sequence in it
@@ -155,7 +161,7 @@
         } else {
           const loffset = fromBytes(bytes, currOffset + 4)
           firstDataLine = this._findFirstData(firstDataLine, loffset)
-          const chunkCount = bytes.readInt32LE(currOffset + 12)
+          const chunkCount = dataView.getInt32(currOffset + 12, true)
           currOffset += 16
           const chunks = new Array(chunkCount)
           for (let k = 0; k < chunkCount; k += 1) {
@@ -186,14 +192,15 @@
     }
   }

-  parsePseudoBin(bytes: Buffer, offset: number) {
-    const lineCount = longToNumber(
-      Long.fromBytesLE(
-        bytes.slice(offset + 28, offset + 36) as unknown as number[],
-        true,
-      ),
-    )
-    return { lineCount }
-  }
+  parsePseudoBin(bytes: Uint8Array, offset: number) {
+    return {
+      lineCount: longToNumber(
+        Long.fromBytesLE(
+          bytes.slice(offset + 28, offset + 36) as unknown as number[],
+          true,
+        ),
+      ),
+    }
+  }

   async blocksForRange(
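parsePseudoBin keeps using the long library because the pseudo-bin's line count is a 64-bit little-endian integer, beyond what the 32-bit DataView reads cover; Long.fromBytesLE takes an array-like of byte values, which is what the as unknown as number[] cast feeds it. The same decode as a sketch (Array.from avoids the cast; the precision caveat is a general fact about JS numbers, not something this commit addresses):

    import Long from 'long'

    // Decode an unsigned 64-bit little-endian integer from 8 bytes. Values
    // above Number.MAX_SAFE_INTEGER lose precision as a JS number.
    function readUint64LE(bytes: Uint8Array, offset: number): number {
      return Long.fromBytesLE(
        Array.from(bytes.subarray(offset, offset + 8)),
        true, // unsigned
      ).toNumber()
    }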
2 changes: 1 addition & 1 deletion src/indexFile.ts
@@ -1,4 +1,4 @@
-import { GenericFilehandle } from 'generic-filehandle'
+import { GenericFilehandle } from 'generic-filehandle2'
 import VirtualOffset from './virtualOffset'
 import Chunk from './chunk'

47 changes: 13 additions & 34 deletions src/tabixIndexedFile.ts
@@ -1,7 +1,6 @@
 import AbortablePromiseCache from '@gmod/abortable-promise-cache'
 import LRU from 'quick-lru'
-import { Buffer } from 'buffer'
-import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle'
+import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle2'
 import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
 import { checkAbortSignal } from './util'
 import IndexFile, { Options, IndexData } from './indexFile'
@@ -17,17 +16,14 @@ function isASCII(str: string)

 type GetLinesCallback = (line: string, fileOffset: number) => void

-const decoder =
-  typeof TextDecoder !== 'undefined' ? new TextDecoder('utf8') : undefined
-
 interface GetLinesOpts {
   [key: string]: unknown
   signal?: AbortSignal
   lineCallback: GetLinesCallback
 }

 interface ReadChunk {
-  buffer: Buffer
+  buffer: Uint8Array
   cpositions: number[]
   dpositions: number[]
 }
@@ -196,6 +192,7 @@ export default class TabixIndexedFile

     const chunks = await this.index.blocksForRange(refName, start, end, options)
     checkAbortSignal(signal)
+    const decoder = new TextDecoder('utf8')

     // now go through each chunk and parse and filter the lines out of it
     for (const c of chunks) {
@@ -209,11 +206,11 @@
       let blockStart = 0
       let pos = 0

-      const str = decoder?.decode(buffer) ?? buffer.toString()
       // fast path, Buffer is just ASCII chars and not gigantor, can be
       // converted to string and processed directly. if it is not ASCII or
       // gigantic (chrome max str len is 512Mb), we have to decode line by line
-      const strIsASCII = buffer.length < 500_000_000 && isASCII(str)
+      const str = decoder.decode(buffer)
+      const strIsASCII = isASCII(str)
       while (blockStart < str.length) {
         let line: string
         let n: number
@@ -224,12 +221,12 @@
           }
           line = str.slice(blockStart, n)
         } else {
-          n = buffer.indexOf('\n', blockStart)
+          n = buffer.indexOf('\n'.charCodeAt(0), blockStart)
           if (n === -1) {
             break
           }
           const b = buffer.slice(blockStart, n)
-          line = decoder?.decode(b) ?? b.toString()
+          line = decoder.decode(b)
         }

         // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
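Unlike Buffer.indexOf, which also accepts string needles, Uint8Array.prototype.indexOf only matches numbers, hence the '\n'.charCodeAt(0) above. The slow-path line splitting as a standalone sketch (bytes after the last newline are left unprocessed, as in the loop above):

    // Split a Uint8Array into decoded lines on 0x0a ('\n') bytes.
    function splitLines(buffer: Uint8Array): string[] {
      const decoder = new TextDecoder('utf8')
      const NL = '\n'.charCodeAt(0) // indexOf on a Uint8Array needs a byte value
      const lines: string[] = []
      let start = 0
      let n: number
      while ((n = buffer.indexOf(NL, start)) !== -1) {
        lines.push(decoder.decode(buffer.subarray(start, n)))
        start = n + 1
      }
      return lines
    }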
@@ -292,10 +289,10 @@
     checkAbortSignal(opts.signal)

     const maxFetch = (firstDataLine?.blockPosition || 0) + maxBlockSize
-    // TODO: what if we don't have a firstDataLine, and the header
-    // actually takes up more than one block? this case is not covered here
+    // TODO: what if we don't have a firstDataLine, and the header actually
+    // takes up more than one block? this case is not covered here

-    const buf = await this._readRegion(0, maxFetch, opts)
+    const buf = await this.filehandle.read(0, maxFetch, opts)
     const bytes = await unzip(buf)

     // trim off lines after the last non-meta line
@@ -492,32 +489,14 @@
     return this.index.lineCount(refName, opts)
   }

-  async _readRegion(pos: number, size: number, opts: Options = {}) {
-    const b = Buffer.alloc(size)
-    const { bytesRead, buffer } = await this.filehandle.read(
-      b,
-      0,
-      size,
-      pos,
-      opts,
-    )
-
-    return buffer.subarray(0, bytesRead)
-  }
-
   /**
    * read and uncompress the data in a chunk (composed of one or more
    * contiguous bgzip blocks) of the file
    */
   async readChunk(c: Chunk, opts: Options = {}) {
-    // fetch the uncompressed data, uncompress carefully a block at a time, and
-    // stop when done
-
-    const data = await this._readRegion(
-      c.minv.blockPosition,
-      c.fetchedSize(),
-      opts,
+    return unzipChunkSlice(
+      await this.filehandle.read(c.minv.blockPosition, c.fetchedSize(), opts),
+      c,
     )
-    return unzipChunkSlice(data, c)
   }
 }
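The deleted _readRegion helper existed only to adapt the old fs.read-style contract (caller-allocated buffer, { bytesRead, buffer } result, trim afterwards); with generic-filehandle2's read resolving to a Uint8Array already trimmed to the bytes actually read, readChunk collapses to a single expression. The simplified flow as a sketch, with signatures assumed from this diff's call sites (Chunk is the repo's ./chunk class):

    import { unzipChunkSlice } from '@gmod/bgzf-filehandle'
    import { GenericFilehandle } from 'generic-filehandle2'
    import Chunk from './chunk'

    // Fetch the compressed byte range the index points at, then inflate only
    // the bgzf blocks the chunk covers, mirroring readChunk above.
    async function readChunk(fh: GenericFilehandle, c: Chunk) {
      const compressed = await fh.read(c.minv.blockPosition, c.fetchedSize())
      return unzipChunkSlice(compressed, c)
    }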