diff --git a/src/Arrow.dom.ts b/src/Arrow.dom.ts
index 777b8599..a84411df 100644
--- a/src/Arrow.dom.ts
+++ b/src/Arrow.dom.ts
@@ -74,7 +74,7 @@ export {
     tableFromIPC, tableToIPC,
     MessageReader, AsyncMessageReader, JSONMessageReader,
     Message,
-    RecordBatch,
+    RecordBatch, recordBatchFromArrays,
     util,
     Builder, makeBuilder, builderThroughIterable, builderThroughAsyncIterable,
     compressionRegistry, CompressionType,
diff --git a/src/Arrow.ts b/src/Arrow.ts
index 848156d8..37b979bb 100644
--- a/src/Arrow.ts
+++ b/src/Arrow.ts
@@ -99,7 +99,7 @@
 export { compressionRegistry } from './ipc/compression/registry.js';
 export type { Codec } from './ipc/compression/registry.js';
 export { MessageReader, AsyncMessageReader, JSONMessageReader } from './ipc/message.js';
 export { Message } from './ipc/metadata/message.js';
-export { RecordBatch } from './recordbatch.js';
+export { RecordBatch, recordBatchFromArrays } from './recordbatch.js';
 export type { ArrowJSONLike, FileHandle, Readable, Writable, ReadableWritable, ReadableDOMStreamOptions } from './io/interfaces.js';
 export {
diff --git a/src/factories.ts b/src/factories.ts
index 657ae1b9..eba97ec2 100644
--- a/src/factories.ts
+++ b/src/factories.ts
@@ -80,9 +80,35 @@ export function vectorFromArray<T extends DataType>(data: DataProps<T>): Vector<T>;
 export function vectorFromArray<T extends TypedArray | BigIntArray>(data: T): Vector<TypedArrayDataType<T>>;
 
 export function vectorFromArray(init: any, type?: dtypes.DataType) {
-    if (init instanceof Data || init instanceof Vector || init.type instanceof dtypes.DataType || ArrayBuffer.isView(init)) {
+    if (init instanceof Data || init instanceof Vector || init.type instanceof dtypes.DataType) {
         return makeVector(init as any);
     }
+    if (ArrayBuffer.isView(init) && !type) {
+        return makeVector(init as any);
+    }
+    if (ArrayBuffer.isView(init) && type) {
+        // Validate BigInt/number boundary
+        const isBigIntInput = init instanceof BigInt64Array || init instanceof BigUint64Array;
+        const isBigIntTarget = type.ArrayType === BigInt64Array || type.ArrayType === BigUint64Array;
+        if (isBigIntInput && !isBigIntTarget) {
+            throw new TypeError(
+                `Cannot convert BigInt input to ${type}. BigInt arrays can only target BigInt-based types (e.g. Int64, Uint64).`
+            );
+        }
+        if (!isBigIntInput && isBigIntTarget) {
+            throw new TypeError(
+                `Cannot convert non-BigInt input to ${type}. ${type} requires BigInt values.`
+            );
+        }
+
+        // Fast path: direct TypedArray conversion for Int and Float types
+        if (dtypes.DataType.isInt(type) || dtypes.DataType.isFloat(type)) {
+            const data = init.constructor === type.ArrayType
+                ? init // zero-copy, same TypedArray type
+                : new (type.ArrayType as any)(init); // standard JS TypedArray conversion
+            return makeVector({ type, data, offset: 0, length: data.length, nullCount: 0 } as any);
+        }
+    }
     const options: IterableBuilderOptions = { type: type ?? inferType(init), nullValues: [null] };
     const chunks = [...builderThroughIterable(options)(init)];
     const vector = chunks.length === 1 ? chunks[0] : chunks.reduce((a, b) => a.concat(b));
diff --git a/src/recordbatch.ts b/src/recordbatch.ts
index cb4dbf9b..cf6a043f 100644
--- a/src/recordbatch.ts
+++ b/src/recordbatch.ts
@@ -21,6 +21,8 @@
 import { Vector } from './vector.js';
 import { Schema, Field } from './schema.js';
 import { DataType, Struct, Null, TypeMap } from './type.js';
 import { wrapIndex } from './util/vector.js';
+import { vectorFromArray } from './factories.js';
+import type { ArrayDataType, BigIntArray, TypedArray } from './interfaces.js';
 import { instance as getVisitor } from './visitor/get.js';
 import { instance as setVisitor } from './visitor/set.js';
@@ -306,6 +308,64 @@ Object.defineProperty(RecordBatch, Symbol.hasInstance, {
     },
 });
 
+/**
+ * Creates a new RecordBatch from an object of typed arrays or JavaScript arrays.
+ *
+ * @example
+ * ```ts
+ * const batch = recordBatchFromArrays({
+ *   a: [1, 2, 3],
+ *   b: new Int8Array([1, 2, 3]),
+ * });
+ * ```
+ *
+ * @example
+ * ```ts
+ * const schema = new Schema([
+ *   new Field('a', new Int32),
+ *   new Field('b', new Utf8),
+ * ]);
+ * const batch = recordBatchFromArrays({ a: [1, 2, 3], b: ['x', 'y', 'z'] }, schema);
+ * ```
+ *
+ * @param input An object mapping column names to typed arrays or JavaScript arrays.
+ * @param schema Optional schema to control column types, ordering, nullability, and metadata.
+ * @returns A new RecordBatch.
+ */
+export function recordBatchFromArrays<T extends TypeMap = any>(
+    input: Record<string, TypedArray | BigIntArray | readonly unknown[]>,
+    schema: Schema<T>
+): RecordBatch<T>;
+export function recordBatchFromArrays<I extends Record<string, TypedArray | BigIntArray | readonly unknown[]>>(
+    input: I
+): RecordBatch<{ [P in keyof I]: ArrayDataType<I[P]> }>;
+export function recordBatchFromArrays(
+    input: Record<string, TypedArray | BigIntArray | readonly unknown[]>,
+    schema?: Schema
+): RecordBatch {
+    if (schema) {
+        const children: Data[] = [];
+        for (const field of schema.fields) {
+            const col = input[field.name];
+            if (col === undefined) {
+                throw new TypeError(
+                    `Schema field "${field.name}" not found in input. ` +
+                    `Available keys: [${Object.keys(input).join(', ')}]`
+                );
+            }
+            children.push(vectorFromArray(col as any, field.type).data[0]);
+        }
+        // Pad shorter columns with nulls so every child has the same length.
+        const [paddedSchema, data] = ensureSameLengthData(schema, children);
+        return new RecordBatch(paddedSchema, data);
+    }
+    const dataMap: Record<string, Data> = {};
+    for (const [key, col] of Object.entries(input)) {
+        dataMap[key] = vectorFromArray(col).data[0];
+    }
+    return new RecordBatch(dataMap as any);
+}
+
 /** @ignore */
 function ensureSameLengthData(
     schema: Schema<T>,
diff --git a/src/table.ts b/src/table.ts
index 2caeb75a..d6464f8d 100644
--- a/src/table.ts
+++ b/src/table.ts
@@ -454,15 +454,48 @@
  * })
  * ```
  *
- * @param input Input an object of typed arrays or JavaScript arrays.
+ * @example
+ * ```ts
+ * const schema = new Schema([
+ *   new Field('a', new Int32),
+ *   new Field('b', new Utf8),
+ * ]);
+ * const table = tableFromArrays({ a: [1, 2, 3], b: ['x', 'y', 'z'] }, schema);
+ * ```
+ *
+ * @param input An object mapping column names to typed arrays or JavaScript arrays.
+ * @param schema Optional schema to control column types, ordering, nullability, and metadata.
  * @returns A new Table.
  */
-export function tableFromArrays<I extends Record<string, TypedArray | BigIntArray | readonly unknown[]>>(input: I): Table<{ [P in keyof I]: ArrayDataType<I[P]> }> {
-    type T = { [P in keyof I]: ArrayDataType<I[P]> };
-    const vecs = {} as VectorsMap<T>;
-    const inputs = Object.entries(input) as [keyof I, I[keyof I]][];
-    for (const [key, col] of inputs) {
+export function tableFromArrays<T extends TypeMap = any>(
+    input: Record<string, TypedArray | BigIntArray | readonly unknown[]>,
+    schema: Schema<T>
+): Table<T>;
+export function tableFromArrays<I extends Record<string, TypedArray | BigIntArray | readonly unknown[]>>(
+    input: I
+): Table<{ [P in keyof I]: ArrayDataType<I[P]> }>;
+export function tableFromArrays(
+    input: Record<string, TypedArray | BigIntArray | readonly unknown[]>,
+    schema?: Schema
+): Table {
+    if (schema) {
+        const vecs: Vector[] = [];
+        for (const field of schema.fields) {
+            const col = input[field.name];
+            if (col === undefined) {
+                throw new TypeError(
+                    `Schema field "${field.name}" not found in input. ` +
+                    `Available keys: [${Object.keys(input).join(', ')}]`
+                );
+            }
+            vecs.push(vectorFromArray(col as any, field.type));
+        }
+        const [adjustedSchema, batches] = distributeVectorsIntoRecordBatches(schema, vecs);
+        return new Table(adjustedSchema, batches);
+    }
+    const vecs = {} as Record<string, Vector>;
+    for (const [key, col] of Object.entries(input)) {
         vecs[key] = vectorFromArray(col);
     }
-    return new Table(vecs);
+    return new Table(vecs);
 }
diff --git a/test/unit/recordbatch/record-batch-tests.ts b/test/unit/recordbatch/record-batch-tests.ts
index fe537c86..7d6043ef 100644
--- a/test/unit/recordbatch/record-batch-tests.ts
+++ b/test/unit/recordbatch/record-batch-tests.ts
@@ -18,7 +18,7 @@
 import '../../jest-extensions.js';
 
 import { arange } from '../utils.js';
-import { RecordBatch, makeVector } from 'apache-arrow';
+import { RecordBatch, makeVector, recordBatchFromArrays, Schema, Field, Int32, Float32, Float64, Utf8, Dictionary } from 'apache-arrow';
 
 function numsRecordBatch(i32Len: number, f32Len: number) {
     return new RecordBatch({
@@ -130,3 +130,98 @@
         });
     });
 });
+
+describe(`recordBatchFromArrays()`, () => {
+    test(`creates a RecordBatch from typed arrays and JavaScript arrays`, () => {
+        const batch = recordBatchFromArrays({
+            a: new Float32Array([1, 2, 3]),
+            b: [4, 5, 6],
+            c: ['x', 'y', 'z'],
+        });
+
+        expect(batch.numRows).toBe(3);
+        expect(batch.numCols).toBe(3);
+        expect(batch.getChild('a')!.type).toBeInstanceOf(Float32);
+        expect(batch.getChild('b')!.type).toBeInstanceOf(Float64);
+        expect(batch.getChild('c')!.type).toBeInstanceOf(Dictionary);
+    });
+
+    test(`schema overrides type inference`, () => {
+        const schema = new Schema([
+            new Field('a', new Int32),
+            new Field('b', new Utf8),
+        ]);
+        const batch = recordBatchFromArrays({ a: [1, 2, 3], b: ['x', 'y', 'z'] }, schema);
+
+        expect(batch.numRows).toBe(3);
+        expect(batch.getChild('a')!.type).toBeInstanceOf(Int32);
+        expect(batch.getChild('b')!.type).toBeInstanceOf(Utf8);
+        expect(batch.getChild('a')!.toArray()).toEqual(new Int32Array([1, 2, 3]));
+    });
+
+    test(`schema coerces TypedArray type`, () => {
+        const schema = new Schema([new Field('a', new Int32)]);
+        const batch = recordBatchFromArrays({ a: new Float32Array([1, 2, 3]) }, schema);
+        expect(batch.getChild('a')!.type).toBeInstanceOf(Int32);
+        expect(batch.getChild('a')!.toArray()).toEqual(new Int32Array([1, 2, 3]));
+    });
+
+    test(`preserves schema metadata`, () => {
+        const schema = new Schema(
+            [new Field('a', new Int32)],
+            new Map([['source', 'test']])
+        );
+        const batch = recordBatchFromArrays({ a: [1, 2, 3] }, schema);
+        expect(batch.schema.metadata.get('source')).toBe('test');
+    });
+
+    test(`throws on missing schema field`, () => {
+        const schema = new Schema([new Field('c', new Int32)]);
+        expect(() => recordBatchFromArrays({ a: [1] }, schema)).toThrow(TypeError);
+        expect(() => recordBatchFromArrays({ a: [1] }, schema)).toThrow(/Schema field "c" not found in input/);
+    });
+
+    test(`handles different length columns via ensureSameLengthData`, () => {
+        const schema = new Schema([
+            new Field('a', new Int32),
+            new Field('b', new Int32),
+        ]);
+        const batch = recordBatchFromArrays({ a: [1, 2, 3], b: [4, 5] }, schema);
+        expect(batch.numRows).toBe(3);
+        expect(batch.getChild('a')!).toHaveLength(3);
+        expect(batch.getChild('b')!).toHaveLength(3);
+        expect(batch.getChild('b')!.nullCount).toBe(1);
+    });
+
+    test(`preserves field ordering from schema`, () => {
+        const schema = new Schema([
+            new Field('b', new Float64),
+            new Field('a', new Int32),
+        ]);
+        const batch = recordBatchFromArrays({ a: [1, 2, 3], b: [4.0, 5.0, 6.0] }, schema);
+        expect(batch.schema.fields[0].name).toBe('b');
+        expect(batch.schema.fields[1].name).toBe('a');
+        expect(batch.getChild('b')!.type).toBeInstanceOf(Float64);
+        expect(batch.getChild('a')!.type).toBeInstanceOf(Int32);
+    });
+
+    test(`handles empty arrays`, () => {
+        const schema = new Schema([new Field('a', new Int32)]);
+        const batch = recordBatchFromArrays({ a: new Int32Array(0) }, schema);
+        expect(batch.numRows).toBe(0);
+        expect(batch.numCols).toBe(1);
+        expect(batch.getChild('a')!.type).toBeInstanceOf(Int32);
+    });
+
+    test(`basic creation without schema infers types`, () => {
+        const batch = recordBatchFromArrays({
+            f32: new Float32Array([1, 2]),
+            nums: [1, 2, 3],
+            strs: ['a', 'b'],
+        });
+
+        expect(batch.getChild('f32')!.type).toBeInstanceOf(Float32);
+        expect(batch.getChild('nums')!.type).toBeInstanceOf(Float64);
+        expect(batch.getChild('strs')!.type).toBeInstanceOf(Dictionary);
+    });
+});
diff --git a/test/unit/table/table-test.ts b/test/unit/table/table-test.ts
index 01af4090..55a6ab7b 100644
--- a/test/unit/table/table-test.ts
+++ b/test/unit/table/table-test.ts
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-import { Bool, Dictionary, Float32, Float64, Int32, Int8, makeTable, tableFromArrays, tableFromJSON } from 'apache-arrow';
+import { Bool, DataType, Dictionary, Float32, Float64, Int32, Int8, Utf8, Schema, Field, makeTable, tableFromArrays, tableFromJSON, tableToIPC, tableFromIPC, Type } from 'apache-arrow';
 
 describe('makeTable()', () => {
     test(`creates a new Table from Typed Arrays`, () => {
@@ -59,6 +59,172 @@
     });
 });
 
+describe('tableFromArrays() with schema', () => {
+    test(`schema overrides number inference to Int32`, () => {
+        const schema = new Schema([new Field('a', new Int32)]);
+        const table = tableFromArrays({ a: [1, 2, 3] }, schema);
+        expect(table.getChild('a')!.type).toBeInstanceOf(Int32);
+        expect(table.numRows).toBe(3);
+        expect(table.getChild('a')!.toArray()).toEqual(new Int32Array([1, 2, 3]));
+    });
+
+    test(`schema overrides string inference to Utf8`, () => {
+        const schema = new Schema([new Field('b', new Utf8)]);
+        const table = tableFromArrays({ b: ['a', 'b'] }, schema);
+        expect(table.getChild('b')!.type).toBeInstanceOf(Utf8);
+        expect(table.numRows).toBe(2);
+        expect(table.getChild('b')!.get(0)).toBe('a');
+        expect(table.getChild('b')!.get(1)).toBe('b');
+    });
+
+    test(`schema coerces TypedArray type`, () => {
+        const schema = new Schema([new Field('a', new Int32)]);
+        const table = tableFromArrays({ a: new Float32Array([1, 2, 3]) }, schema);
+        expect(table.getChild('a')!.type).toBeInstanceOf(Int32);
+        expect(table.getChild('a')!.toArray()).toEqual(new Int32Array([1, 2, 3]));
+    });
+
+    test(`preserves schema-level metadata`, () => {
+        const schema = new Schema(
+            [new Field('a', new Int32)],
+            new Map([['source', 'test']])
+        );
+        const table = tableFromArrays({ a: [1, 2, 3] }, schema);
+        expect(table.schema.metadata.get('source')).toBe('test');
+    });
+
+    test(`preserves field-level metadata`, () => {
+        const schema = new Schema([
+            new Field('a', new Int32, true, new Map([['unit', 'm']]))
+        ]);
+        const table = tableFromArrays({ a: [1, 2, 3] }, schema);
+        expect(table.schema.fields[0].metadata.get('unit')).toBe('m');
+    });
+
+    test(`preserves field ordering from schema`, () => {
+        const schema = new Schema([
+            new Field('b', new Float64),
+            new Field('a', new Int32),
+        ]);
+        const table = tableFromArrays({ a: [1, 2, 3], b: [4.0, 5.0, 6.0] }, schema);
+        expect(table.schema.fields[0].name).toBe('b');
+        expect(table.schema.fields[1].name).toBe('a');
+        expect(table.getChild('b')!.type).toBeInstanceOf(Float64);
+        expect(table.getChild('a')!.type).toBeInstanceOf(Int32);
+    });
+
+    test(`throws on missing schema field`, () => {
+        const schema = new Schema([new Field('c', new Int32)]);
+        expect(() => tableFromArrays({ a: [1] }, schema)).toThrow(TypeError);
+        expect(() => tableFromArrays({ a: [1] }, schema)).toThrow(/Schema field "c" not found in input/);
+    });
+
+    test(`ignores extra input keys not in schema`, () => {
+        const schema = new Schema([new Field('a', new Int32)]);
+        const table = tableFromArrays({ a: [1, 2], b: [3, 4] }, schema);
+        expect(table.numCols).toBe(1);
+        expect(table.schema.fields[0].name).toBe('a');
+    });
+
+    test(`preserves nullability`, () => {
+        const schema = new Schema([new Field('a', new Int32, false)]);
+        const table = tableFromArrays({ a: [1, 2, 3] }, schema);
+        expect(table.schema.fields[0].nullable).toBe(false);
+    });
+
+    test(`handles null values in input`, () => {
+        const schema = new Schema([new Field('a', new Int32, true)]);
+        const table = tableFromArrays({ a: [1, null, 3] }, schema);
+        expect(table.numRows).toBe(3);
+        expect(table.getChild('a')!.nullCount).toBeGreaterThan(0);
+    });
+
+    test(`BigInt boundary throws for BigInt64Array to Int32`, () => {
+        const schema = new Schema([new Field('a', new Int32)]);
+        expect(() => tableFromArrays({ a: new BigInt64Array([1n, 2n]) }, schema)).toThrow(TypeError);
+        expect(() => tableFromArrays({ a: new BigInt64Array([1n, 2n]) }, schema)).toThrow(/BigInt/);
+    });
+
+    test(`handles empty arrays`, () => {
+        const schema = new Schema([new Field('a', new Int32)]);
+        const table = tableFromArrays({ a: new Int32Array(0) }, schema);
+        expect(table.numRows).toBe(0);
+        expect(table.numCols).toBe(1);
+        expect(table.getChild('a')!.type).toBeInstanceOf(Int32);
+    });
+});
+
+describe('tableFromArrays() with schema IPC round-trip', () => {
+    const schema = new Schema([
+        new Field('b', new Utf8),
+        new Field('a', new Int32, true, new Map([['unit', 'meters']])),
+        new Field('c', new Int8),
+    ], new Map([['source', 'test']]));
+
+    // Input key order differs from schema field order
+    const input = {
+        a: [1, null, 3],
+        b: ['x', 'y', 'z'],
+        c: new Float32Array([10, 20, 30]),
+    };
+
+    for (const format of ['stream', 'file'] as const) {
+        test(`round-trips through IPC ${format} format`, () => {
+            const original = tableFromArrays(input, schema);
+            const buffer = tableToIPC(original, format);
+            const table = tableFromIPC(buffer);
+
+            // Row and column counts
+            expect(table.numRows).toBe(3);
+            expect(table.numCols).toBe(3);
+
+            // Field ordering matches schema (not input key order)
+            expect(table.schema.fields[0].name).toBe('b');
+            expect(table.schema.fields[1].name).toBe('a');
+            expect(table.schema.fields[2].name).toBe('c');
+
+            // Types match schema-specified types (IPC reconstructs base classes, so check typeId + properties)
+            const typeA = table.getChild('a')!.type;
+            expect(DataType.isInt(typeA)).toBe(true);
+            expect((typeA as any).bitWidth).toBe(32);
+
+            const typeB = table.getChild('b')!.type;
+            expect(typeB.typeId).toBe(Type.Utf8);
+
+            const typeC = table.getChild('c')!.type;
+            expect(DataType.isInt(typeC)).toBe(true);
+            expect((typeC as any).bitWidth).toBe(8);
+
+            // Schema-level metadata
+            expect(table.schema.metadata.get('source')).toBe('test');
+
+            // Field-level metadata
+            const aField = table.schema.fields.find(f => f.name === 'a')!;
+            expect(aField.metadata.get('unit')).toBe('meters');
+
+            // Nullability and null counts
+            expect(aField.nullable).toBe(true);
+            expect(table.getChild('a')!.nullCount).toBe(1);
+
+            // Data values
+            const colA = table.getChild('a')!;
+            expect(colA.get(0)).toBe(1);
+            expect(colA.get(1)).toBeNull();
+            expect(colA.get(2)).toBe(3);
+
+            expect(table.getChild('b')!.get(0)).toBe('x');
+            expect(table.getChild('b')!.get(1)).toBe('y');
+            expect(table.getChild('b')!.get(2)).toBe('z');
+
+            // TypedArray coercion: Float32Array input → Int8 output
+            const colC = table.getChild('c')!;
+            expect(colC.get(0)).toBe(10);
+            expect(colC.get(1)).toBe(20);
+            expect(colC.get(2)).toBe(30);
+        });
+    }
+});
+
 describe('tableFromJSON()', () => {
     test(`creates table from array of objects`, () => {
         const table = tableFromJSON([{