JavaScript / WASM API Reference¶
This page embeds `pkg/parot.d.ts` verbatim. `wasm-pack` regenerates that file from the Rust source on every `just wasm-build`, so the signatures and JSDoc below are always the live contract. Never edit this page by hand.
Published as `parot` on npm and as an ESM module for browsers. For installation, quickstart, and usage patterns see the Guide.
Offsets are UTF-16 code units, so `text.slice(i, j)` in JS indexes `findAll()` results correctly.
The `docs/api-contract.yaml` file enforces that every symbol below also appears in the contract tests — `just test-api-js` fails CI if the TypeScript declarations drift.
Type declarations¶
/* tslint:disable */
/* eslint-disable */
/**
 * Compressed full-text index. Build once from text or bytes, then query
 * patterns in O(p) time — independent of text length.
 */
export class WasmIndex {
/**
 * Release the WASM-side memory backing this index. After calling, the
 * object must not be used again.
 */
free(): void;
/**
 * Explicit-resource-management hook (`using` declarations, TC39
 * "Explicit Resource Management"); presumably equivalent to `free()` —
 * confirm against the generated wasm-bindgen glue.
 */
[Symbol.dispose](): void;
/**
 * Boolean mask for multiple patterns at once.
 * Returns a flattened Uint8Array of length `patterns.length * n_segments`.
 */
batchContainsMask(patterns: string[], boundaries?: Uint32Array | null): Uint8Array;
/**
 * Batch count: flat Uint32Array of counts, one per pattern.
 */
batchCount(patterns: string[]): Uint32Array;
/**
 * Per-segment counts for multiple patterns at once.
 * Returns a flattened Uint32Array of length `patterns.length * n_segments`.
 */
batchCountPerSegment(patterns: string[], boundaries?: Uint32Array | null): Uint32Array;
/**
 * Batch extract: Arrow-shape columnar results, one object per pattern.
 * Each object has shape:
 * ```js
 * {
 * textBytes: Uint8Array, // concatenated UTF-8 bytes
 * textOffsets: Uint32Array, // length = n_hits + 1
 * }
 * ```
 */
batchExtract(patterns: string[], max_context?: number | null): Array<any>;
/**
 * Batch locate: returns an Array of Uint32Array, one per pattern.
 * Positions are UTF-16/char offsets when built from a string; byte offsets otherwise.
 */
batchFindAll(patterns: string[]): Array<any>;
/**
 * Batch segment locate: Array of Uint32Array of segment IDs, one per pattern.
 */
batchFindSegments(patterns: string[], boundaries?: Uint32Array | null): Array<any>;
/**
 * Batch search: Arrow-shape columnar results, one object per pattern.
 * Each object has shape:
 * ```js
 * {
 * positions: Uint32Array, // UTF-16 code unit offsets
 * starts: Uint32Array,
 * ends: Uint32Array,
 * matched_bytes: Uint8Array, // concatenated UTF-8 bytes
 * matched_offsets: Uint32Array, // length = n_hits + 1
 * before_bytes: Uint8Array,
 * before_offsets: Uint32Array,
 * after_bytes: Uint8Array,
 * after_offsets: Uint32Array,
 * }
 * ```
 * Positions/starts/ends are UTF-16 code unit offsets when built from a
 * string (compatible with `String.slice()`); byte offsets otherwise.
 * NOTE(review): "byte offsets otherwise" above conflicts with the
 * string-only requirement below — confirm which is current in the Rust
 * source.
 * Requires the index to be built from a string.
 * Note: single-pattern `search` returns the same columnar layout (see
 * `search` below).
 */
batchSearch(patterns: string[], context?: number | null): any;
/**
 * Boolean mask: which segments contain the pattern?
 * Uses stored boundaries from `fromStrings`, or pass explicit `boundaries`.
 */
containsMask(pattern: string, boundaries?: Uint32Array | null): Uint8Array;
/**
 * Boolean mask for a byte pattern.
 */
containsMaskBytes(pattern: Uint8Array, boundaries?: Uint32Array | null): Uint8Array;
/**
 * Count occurrences of a pattern in O(p) time.
 */
count(pattern: string): number;
/**
 * Count occurrences of a byte pattern.
 */
countBytes(pattern: Uint8Array): number;
/**
 * Count per segment.
 */
countPerSegment(pattern: string, boundaries?: Uint32Array | null): Uint32Array;
/**
 * Count per segment for a byte pattern.
 */
countPerSegmentBytes(pattern: Uint8Array, boundaries?: Uint32Array | null): Uint32Array;
/**
 * Restore an index from bytes produced by `serialize()`.
 */
static deserialize(data: Uint8Array): WasmIndex;
/**
 * Extract matched text with forward context for each match (Arrow shape).
 * Returns a single object with the same columnar Arrow layout as
 * `batchExtract`: `{ textBytes: Uint8Array, textOffsets: Uint32Array }`.
 */
extract(pattern: string, max_context?: number | null): any;
/**
 * Return matching segments as a columnar Arrow object.
 *
 * Returns `{ textBytes: Uint8Array, textOffsets: Uint32Array }` where
 * `textOffsets` has length `n_matches + 1` (Arrow string column layout).
 * Requires the index to be built with `fromStrings`.
 */
filter(pattern: string): any;
/**
 * Locate all positions where pattern occurs. Returns a Uint32Array of positions.
 * When constructed from a string, positions are UTF-16 character offsets
 * (compatible with JavaScript's `String.slice()`). When constructed from
 * bytes, positions are raw byte offsets.
 */
findAll(pattern: string): Uint32Array;
/**
 * Locate all positions of a byte pattern.
 */
findAllBytes(pattern: Uint8Array): Uint32Array;
/**
 * Unique segment IDs containing the pattern.
 */
findSegments(pattern: string, boundaries?: Uint32Array | null): Uint32Array;
/**
 * Unique segment IDs for a byte pattern.
 */
findSegmentsBytes(pattern: Uint8Array, boundaries?: Uint32Array | null): Uint32Array;
/**
 * Create an FM-index from Arrow-style string column buffers (zero-copy ingress).
 *
 * `text_bytes` is the concatenated UTF-8 document bytes (Arrow data buffer).
 * `text_offsets` is the Arrow-style offsets array (length = n_docs + 1).
 */
static fromArrow(text_bytes: Uint8Array, text_offsets: Uint32Array, locate_level?: number | null, case_insensitive?: boolean | null, separator?: string | null): WasmIndex;
/**
 * Create an FM-index from a raw byte corpus (`Uint8Array`).
 *
 * This is the byte-equivalent of `new Index(text)`. It builds a
 * brand-new index — *not* a deserializer. To restore an index from
 * `serialize()` output, use `Index.deserialize(bytes)`.
 */
static fromByteCorpus(data: Uint8Array, locate_level?: number | null, case_insensitive?: boolean | null, normalize_whitespace?: boolean | null): WasmIndex;
/**
 * Create an FM-index from an array of strings, storing boundaries internally.
 * Segment-aware methods can then be called without passing boundaries.
 */
static fromStrings(strings: string[], locate_level?: number | null, case_insensitive?: boolean | null, separator?: string | null): WasmIndex;
/**
 * Return matching segment IDs plus segment text as a columnar Arrow object.
 *
 * Returns `{ segmentIds: Uint32Array, textBytes: Uint8Array, textOffsets: Uint32Array }`
 * where `textOffsets` has length `n_matches + 1`.
 * Requires the index to be built with `fromStrings`.
 */
grep(pattern: string): any;
/**
 * Heap memory used by the index in bytes.
 */
heapSize(): number;
/**
 * `String.prototype.includes`-style: true iff the pattern occurs at
 * least once in the indexed text.
 */
includes(pattern: string): boolean;
/**
 * `String.prototype.indexOf`-style: returns the first UTF-16 code-unit
 * offset of `pattern`, or -1 if not found. When built from bytes,
 * returns the byte offset.
 *
 * Returns `f64` rather than `i64` so JS users get a plain `Number`
 * instead of a `BigInt` (matching `String.prototype.indexOf`). f64
 * losslessly represents integers up to 2^53, which is well above any
 * realistic indexed corpus size.
 */
indexOf(pattern: string): number;
/**
 * Create an FM-index from a UTF-8 string.
 * `locate_level` controls the space/time tradeoff for locate():
 * 0 = full SA, fastest locate (default); higher = smaller index, slower locate.
 * `case_insensitive` — lowercase text and queries for case-insensitive matching.
 * `normalize_whitespace` — collapse whitespace runs; locate() remaps positions to original text.
 */
constructor(text: string, locate_level?: number | null, case_insensitive?: boolean | null, normalize_whitespace?: boolean | null);
/**
 * Search for pattern and return structured results with context (Arrow shape).
 * Returns a single object with the same columnar Arrow layout as
 * `batchSearch`: typed-array position columns and Arrow string columns
 * for `matched`, `before`, `after`.
 * Works with any construction path (string, bytes, fromStrings, fromArrow).
 */
search(pattern: string, context?: number | null): any;
/**
 * Serialize the index to bytes for persistence. The returned bytes can be
 * stored (IndexedDB, localStorage, file download) and restored with
 * `Index.deserialize()`.
 */
serialize(): Uint8Array;
/**
 * Install a license for the current process from a Keygen offline signed
 * license file. `licenseFileBytes` is the raw bytes of the file (the
 * armored PEM-like block starting with `-----BEGIN LICENSE FILE-----`).
 * Throws if the signature is invalid, the file is malformed, or a
 * license is already installed.
 */
static setLicenseKey(license_file_bytes: Uint8Array): void;
/**
 * `String.prototype.slice`-style: extract a substring of the indexed
 * source by UTF-16 code-unit range. Out-of-range indices are clamped.
 * Throws if the index has no source text (built via `fromByteCorpus`
 * without a UTF-8 payload).
 */
slice(start: number, end: number): string;
/**
 * Pretty print: `Index(len=N, segments=M, memory=…)`.
 */
toString(): string;
/**
 * Normalization config: `{caseInsensitive, normalizeWhitespace}`.
 */
readonly config: any;
/**
 * True iff the index was built via `fromStrings` and has stored segments.
 */
readonly hasSegments: boolean;
/**
 * Length of the indexed text.
 */
readonly length: number;
/**
 * Heap memory in bytes (alias of `heapSize`).
 */
readonly memoryBytes: number;
/**
 * Number of stored segments, or `null` if none.
 */
readonly segmentCount: any;
}
/**
 * Batch passage extraction: one reference vs many candidates. Returns an
 * `Array<object>`, one columnar payload per candidate.
 * NOTE(review): each payload presumably uses the same columnar shape as
 * `commonPassages` and preserves `candidates` order (as `batchTextSimilarity`
 * does) — confirm against the Rust source.
 */
export function batchCommonPassages(reference: string, candidates: string[]): any;
/**
 * Batch similarity: one reference vs many candidates. Returns a Float64Array
 * of coverage scores, one per candidate, preserving input order.
 * For a single pair, see `textSimilarity`.
 */
export function batchTextSimilarity(reference: string, candidates: string[]): Float64Array;
/**
 * All shared passages between two texts, returned as a columnar Arrow object.
 *
 * Offsets are UTF-16 code units (JavaScript-native), so `textA.slice(aStart, aEnd)`
 * and `textB.slice(bStart, bEnd)` yield the matched substrings.
 * For one reference vs many candidates, see `batchCommonPassages`.
 */
export function commonPassages(text_a: string, text_b: string): any;
/**
 * One-shot convenience: count occurrences of a byte pattern in byte data
 * without the caller managing a `WasmIndex` instance.
 */
export function count_pattern_bytes(data: Uint8Array, pattern: Uint8Array): number;
/**
 * Count the number of distinct substrings (UTF-16 model).
 * Delegates to the core pure Rust implementation.
 * For raw bytes, see `distinct_substring_count_bytes`.
 */
export function distinct_substring_count(text: string): number;
/**
 * Count the number of distinct substrings on raw bytes.
 * Byte-level counterpart of `distinct_substring_count`.
 */
export function distinct_substring_count_bytes(data: Uint8Array): number;
/**
 * Find duplicate phrases with automatic text normalization (columnar Arrow shape).
 *
 * Convenience wrapper that handles lowercasing and/or whitespace collapsing internally.
 * Returned occurrence offsets always reference the original `source_text`.
 *
 * Returns a single object with the same columnar Arrow keys as
 * [`process_test_string`].
 * All tuning parameters are optional; pass `null`/`undefined` to use the
 * library defaults (defaults live in the Rust source).
 */
export function find_duplicates_normalized(source_text: string, case_insensitive?: boolean | null, collapse_whitespace?: boolean | null, min_phrase_length?: number | null, min_string_length?: number | null, max_phrase_length?: number | null, min_words_in_substring?: number | null, enable_block_detection?: boolean | null, clip_sentences?: boolean | null): any;
/**
 * NOTE(review): undocumented in the generated output — presumably a
 * one-time WASM module initialization hook (wasm-bindgen start);
 * confirm against the Rust source before relying on it.
 */
export function init(): void;
/**
 * Find the longest common substring between two strings.
 * Delegates to the core pure Rust implementation.
 * To enumerate all shared passages with offsets, see `commonPassages`.
 */
export function longest_common_substring(text1: string, text2: string): string;
/**
 * Process text to find duplicate phrases (columnar Arrow shape).
 * This is the main entry point called from JavaScript/TypeScript.
 *
 * `removed_ranges_flat` is a flattened array of byte-offset pairs `[start1, end1, start2, end2, ...]`
 * describing ranges removed from `source_text` to produce `sanitized_text`.
 * When non-empty, analysis runs on `sanitized_text` but returned offsets reference `source_text`.
 * Pass an empty array (or `[]`) for backward-compatible behavior (analyze `source_text` directly).
 *
 * Returns a single object with columnar keys: `phraseBytes`, `phraseOffsets`,
 * `count`, `stringLength`, `numberOfWords`, `occurrenceStarts`, `occurrenceEnds`,
 * `occurrenceParentOffsets`. Decode phrase `i`'s occurrences via the parent
 * offsets array: `occurrenceStarts.slice(parent[i], parent[i+1])`.
 *
 * Unlike `find_duplicates_normalized`, all tuning parameters here are
 * required — there are no optional defaults in this signature.
 */
export function process_test_string(sanitized_text: string, source_text: string, removed_ranges_flat: Uint32Array, min_phrase_length: number, min_string_length: number, max_phrase_length: number, min_words_in_substring: number, enable_block_detection: boolean, clip_sentences: boolean): any;
/**
 * Similarity score between two texts (cross-boundary coverage).
 * For one reference vs many candidates, see `batchTextSimilarity`.
 */
export function textSimilarity(text_a: string, text_b: string): number;