From 2006959ea745adf5cb5704ee87c1b00f9b425d5f Mon Sep 17 00:00:00 2001 From: djstrong Date: Tue, 30 Sep 2025 14:27:50 +0200 Subject: [PATCH 01/30] feat: add CSV conversion command to ensrainbow CLI - Introduced `convert-csv` command for converting CSV files to .ensrainbow format. - Added support for single and two-column CSV formats. - Implemented error handling for invalid CSV data. - Created tests for various CSV scenarios, including special characters and invalid formats. - Updated package dependencies to include `csv-simple-parser` for CSV parsing. --- apps/ensrainbow/package.json | 3 +- apps/ensrainbow/src/cli.ts | 49 +++- .../src/commands/convert-csv-command.test.ts | 241 +++++++++++++++++ .../src/commands/convert-csv-command.ts | 248 ++++++++++++++++++ .../test/fixtures/test_labels_1col.csv | 10 + .../test/fixtures/test_labels_2col.csv | 10 + .../fixtures/test_labels_invalid_first.csv | 3 + .../fixtures/test_labels_invalid_hash.csv | 4 + .../fixtures/test_labels_special_chars.csv | 10 + pnpm-lock.yaml | 17 +- 10 files changed, 591 insertions(+), 4 deletions(-) create mode 100644 apps/ensrainbow/src/commands/convert-csv-command.test.ts create mode 100644 apps/ensrainbow/src/commands/convert-csv-command.ts create mode 100644 apps/ensrainbow/test/fixtures/test_labels_1col.csv create mode 100644 apps/ensrainbow/test/fixtures/test_labels_2col.csv create mode 100644 apps/ensrainbow/test/fixtures/test_labels_invalid_first.csv create mode 100644 apps/ensrainbow/test/fixtures/test_labels_invalid_hash.csv create mode 100644 apps/ensrainbow/test/fixtures/test_labels_special_chars.csv diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index ea7c2b95c..af46315e9 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -38,7 +38,8 @@ "progress": "^2.0.3", "protobufjs": "^7.4.0", "viem": "catalog:", - "yargs": "^17.7.2" + "yargs": "^17.7.2", + "csv-simple-parser": "^2.0.2" }, "devDependencies": { "@ensnode/shared-configs": "workspace:*", diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 3fdc0d530..063c48df2 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -13,6 +13,7 @@ import { } from "@ensnode/ensnode-sdk"; import { convertCommand } from "@/commands/convert-command"; +import { convertCsvCommand } from "@/commands/convert-csv-command"; // import { ingestCommand } from "@/commands/ingest-command"; import { ingestProtobufCommand } from "@/commands/ingest-protobuf-command"; import { purgeCommand } from "@/commands/purge-command"; @@ -61,6 +62,13 @@ interface ConvertArgs { "label-set-version": LabelSetVersion; } +interface ConvertCsvArgs { + "input-file": string; + "output-file": string; + "label-set-id": LabelSetId; + "label-set-version": LabelSetVersion; +} + export interface CLIOptions { exitProcess?: boolean; } @@ -184,7 +192,7 @@ export function createCLI(options: CLIOptions = {}) { ) .command( "convert", - "Convert rainbow tables from SQL dump to protobuf format", + "Convert rainbow tables from SQL dump to ensrainbow format", (yargs: Argv) => { return yargs .option("input-file", { @@ -194,7 +202,7 @@ export function createCLI(options: CLIOptions = {}) { }) .option("output-file", { type: "string", - description: "Path to the output protobuf file", + description: "Path to the output ensrainbow file", default: join(process.cwd(), "rainbow-records.ensrainbow"), }) .option("label-set-id", { @@ -219,6 +227,43 @@ export function createCLI(options: CLIOptions = {}) { }); }, ) + .command( + "convert-csv", + 
"Convert rainbow tables from CSV format to ensrainbow format", + (yargs: Argv) => { + return yargs + .option("input-file", { + type: "string", + description: "Path to the CSV input file", + demandOption: true, + }) + .option("output-file", { + type: "string", + description: "Path to the output ensrainbow file", + default: join(process.cwd(), "rainbow-records.ensrainbow"), + }) + .option("label-set-id", { + type: "string", + description: "Label set id for the rainbow record collection", + demandOption: true, + }) + .coerce("label-set-id", buildLabelSetId) + .option("label-set-version", { + type: "number", + description: "Label set version for the rainbow record collection", + demandOption: true, + }) + .coerce("label-set-version", buildLabelSetVersion); + }, + async (argv: ArgumentsCamelCase) => { + await convertCsvCommand({ + inputFile: argv["input-file"], + outputFile: argv["output-file"], + labelSetId: argv["label-set-id"], + labelSetVersion: argv["label-set-version"], + }); + }, + ) .demandCommand(1, "You must specify a command") .fail((msg, err, yargs) => { if (process.env.VITEST) { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts new file mode 100644 index 000000000..2be46d924 --- /dev/null +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -0,0 +1,241 @@ +import { tmpdir } from "os"; +import { join } from "path"; +import { mkdtemp, rm, stat, writeFile } from "fs/promises"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +import { createCLI } from "@/cli"; +import { type LabelSetId, type LabelSetVersion } from "@ensnode/ensnode-sdk"; +import { convertCsvCommand } from "./convert-csv-command"; + +// Path to test fixtures +const TEST_FIXTURES_DIR = join(__dirname, "..", "..", "test", "fixtures"); + +describe("convert-csv-command", () => { + let tempDir: string; + + beforeEach(async () => { + vi.stubEnv("NODE_ENV", "test"); + tempDir = await mkdtemp(join(tmpdir(), "ensrainbow-csv-test-")); + }); + + afterEach(async () => { + vi.unstubAllEnvs(); + vi.restoreAllMocks(); + await rm(tempDir, { recursive: true, force: true }); + }); + + describe("CSV conversion and ingestion", () => { + it("should convert single column CSV and successfully ingest into database", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "output_1col.ensrainbow"); + const dataDir = join(tempDir, "db_1col"); + + // Convert CSV to ensrainbow format + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-csv-one-col" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + // Verify the output file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Ingest the converted file into database + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + // Verify database was created + const dbStats = await stat(dataDir); + expect(dbStats.isDirectory()).toBe(true); + + // Verify database contents by validating it + await cli.parse(["validate", "--data-dir", dataDir, "--lite"]); + + // Database validation passed, which means records are accessible + }); + + it("should convert two column CSV with provided hashes and ingest successfully", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_2col.csv"); + const outputFile = 
join(tempDir, "output_2col.ensrainbow"); + const dataDir = join(tempDir, "db_2col"); + + // Convert CSV to ensrainbow format + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-csv-two-col" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + // Verify the output file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Ingest the converted file into database + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + // Verify database was created + const dbStats = await stat(dataDir); + expect(dbStats.isDirectory()).toBe(true); + }); + + it("should fail when CSV has inconsistent column count", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_invalid_first.csv"); + const outputFile = join(tempDir, "output_invalid.ensrainbow"); + + // Convert CSV to ensrainbow format (should fail on inconsistent columns) + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-csv-invalid" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }), + ).rejects.toThrow(/CSV conversion failed due to invalid data/); + }); + + it("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_special_chars.csv"); + const outputFile = join(tempDir, "output_special.ensrainbow"); + + // Convert CSV to ensrainbow format + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-csv-special" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + // Verify output file was created + const outputStats = await stat(outputFile); + expect(outputStats.isFile()).toBe(true); + expect(outputStats.size).toBeGreaterThan(0); + + // Verify special characters were processed correctly by checking logs + // The conversion completed successfully, which means csv-simple-parser + // handled emojis, unicode, quoted fields with commas, etc. 
+ expect(true).toBe(true); // Test passes if conversion doesn't crash + }); + + it("should fail when CSV contains invalid labelhash format", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_invalid_hash.csv"); + const outputFile = join(tempDir, "output_invalid_hash.ensrainbow"); + + // Convert CSV to ensrainbow format (should fail on invalid hash format) + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-csv-invalid-hash" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }), + ).rejects.toThrow(/CSV conversion failed due to invalid data/); + }); + }); + + describe("Error handling", () => { + it("should throw error for non-existent input file", async () => { + const inputFile = join(tempDir, "non-existent.csv"); + const outputFile = join(tempDir, "output.ensrainbow"); + + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-missing" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }), + ).rejects.toThrow(); + }); + }); + + describe("CLI integration", () => { + it("should work through the full CLI pipeline", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "cli_output.ensrainbow"); + const dataDir = join(tempDir, "cli_db"); + + const cli = createCLI({ exitProcess: false }); + + // Test convert-csv command through CLI + await cli.parse([ + "convert-csv", + "--input-file", + inputFile, + "--output-file", + outputFile, + "--label-set-id", + "test-cli-csv", + "--label-set-version", + "0", + ]); + + // Verify file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Test ingestion through CLI + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + // Verify database was created + const dbStats = await stat(dataDir); + expect(dbStats.isDirectory()).toBe(true); + }); + }); + + describe("Streaming performance", () => { + it("should handle small CSV files efficiently", async () => { + const inputFile = join(tempDir, "small_test.csv"); + const outputFile = join(tempDir, "output_small.ensrainbow"); + const dataDir = join(tempDir, "db_small"); + + // Create a CSV with 100 records to test streaming + const records = []; + for (let i = 0; i < 100; i++) { + records.push(`label${i}`); + } + await writeFile(inputFile, records.join("\n")); + + const startTime = Date.now(); + + // Convert CSV + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-small" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + const conversionTime = Date.now() - startTime; + + // Should complete conversion quickly (less than 2 seconds for 100 records) + expect(conversionTime).toBeLessThan(2000); + + // Verify file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Test ingestion + const cli = createCLI({ exitProcess: false }); + const ingestStartTime = Date.now(); + + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + const ingestTime = Date.now() - ingestStartTime; + + // Should complete ingestion quickly (less than 3 seconds for 100 records) + expect(ingestTime).toBeLessThan(3000); + + // Verify database was created + const dbStats = await stat(dataDir); + expect(dbStats.isDirectory()).toBe(true); + }); + }); +}); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts 
b/apps/ensrainbow/src/commands/convert-csv-command.ts new file mode 100644 index 000000000..1c04fbf5c --- /dev/null +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -0,0 +1,248 @@ +/** + * ENSRAINBOW CSV FILE CREATION COMMAND + * + * Converts CSV files to .ensrainbow format with csv-simple-parser + * Supports 1-column (label only) and 2-column (label,labelhash) formats + */ + +import { createReadStream, createWriteStream } from "fs"; +import { createInterface } from "readline"; +import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk"; +import parse from "csv-simple-parser"; +import { labelhash } from "viem"; +import { logger } from "../utils/logger.js"; +import { + CURRENT_ENSRAINBOW_FILE_FORMAT_VERSION, + createRainbowProtobufRoot, +} from "../utils/protobuf-schema.js"; + +/** + * Parse CSV using csv-simple-parser + */ +function parseCsvLine(line: string): string[] { + const result = parse(line); + return result.length > 0 ? (result[0] as string[]) : []; +} + +// No label validation - ENS accepts any UTF-8 string + +export interface ConvertCsvCommandOptions { + inputFile: string; + outputFile: string; + labelSetId: string; + labelSetVersion: number; +} + +interface ConversionStats { + totalLines: number; + processedRecords: number; + skippedRecords: number; + invalidLabels: number; + duplicates: number; + startTime: Date; + endTime?: Date; +} + +/** + * Process a single CSV line with csv-simple-parser and validation + */ +function processStreamingCsvLine(line: string, expectedColumns: number): string[] { + if (line.trim() === "") { + throw new Error("Empty line"); + } + + const parsedLine = parseCsvLine(line); + + // Validate column count + if (parsedLine.length !== expectedColumns) { + throw new Error( + `Expected ${expectedColumns} columns, but found ${parsedLine.length} in line: ${line}`, + ); + } + + return parsedLine; +} + +/** + * Setup input stream for reading CSV line by line + */ +function setupReadStream(inputFile: string) { + const fileStream = createReadStream(inputFile, { encoding: "utf8" }); + return createInterface({ + input: fileStream, + crlfDelay: Infinity, + }); +} + +/** + * Setup output stream for writing protobuf + */ +function setupWriteStream(outputFile: string) { + // For now, just write directly to file without gzip compression + return createWriteStream(outputFile); +} + +/** + * Write protobuf header + */ +function writeHeader( + outputStream: NodeJS.WritableStream, + RainbowRecordCollectionType: any, + labelSetId: string, + labelSetVersion: number, +) { + const headerCollection = RainbowRecordCollectionType.fromObject({ + format_identifier: "ensrainbow", + ensrainbow_file_format_version: CURRENT_ENSRAINBOW_FILE_FORMAT_VERSION, + label_set_id: labelSetId, + label_set_version: labelSetVersion, + records: [], // Header has no records + }); + // Encode and write the header collection with length-prefix encoding + outputStream.write( + Buffer.from(RainbowRecordCollectionType.encodeDelimited(headerCollection).finish()), + ); + logger.info("Wrote header message with version, label set id and label set version."); +} + +/** + * Log conversion summary + */ +function logSummary(stats: ConversionStats) { + stats.endTime = new Date(); + const duration = stats.endTime.getTime() - stats.startTime.getTime(); + + logger.info("=== Conversion Summary ==="); + logger.info(`Total lines processed: ${stats.totalLines}`); + logger.info(`Valid records: ${stats.processedRecords}`); + logger.info(`Skipped records: ${stats.skippedRecords}`); + 
logger.info(`Invalid labels: ${stats.invalidLabels}`); + logger.info(`Duplicates found: ${stats.duplicates}`); + logger.info(`Duration: ${duration}ms`); +} + +/** + * Main CSV conversion command with true streaming using csv-simple-parser + */ +export async function convertCsvCommand(options: ConvertCsvCommandOptions): Promise { + const stats: ConversionStats = { + totalLines: 0, + processedRecords: 0, + skippedRecords: 0, + invalidLabels: 0, + duplicates: 0, + startTime: new Date(), + }; + + try { + logger.info("Starting conversion from CSV to protobuf format..."); + logger.info(`Input file: ${options.inputFile}`); + logger.info(`Output file: ${options.outputFile}`); + logger.info(`Label set id: ${options.labelSetId}`); + logger.info(`Label set version: ${options.labelSetVersion}`); + + // Setup protobuf schema + const { RainbowRecordType, RainbowRecordCollectionType } = createRainbowProtobufRoot(); + + // Setup streams + const outputStream = setupWriteStream(options.outputFile); + + // Write header + writeHeader( + outputStream, + RainbowRecordCollectionType, + options.labelSetId, + options.labelSetVersion, + ); + + logger.info("Reading and processing CSV file line by line with streaming..."); + + // Setup streaming CSV reader + const rl = setupReadStream(options.inputFile); + + let expectedColumns: number | null = null; + let lineNumber = 0; + let processedRecords = 0; + + // Process line by line with csv-simple-parser + for await (const line of rl) { + lineNumber++; + + // Skip empty lines + if (line.trim() === "") { + continue; + } + + try { + // For the first line, detect column count + if (expectedColumns === null) { + const firstLineParsed = parseCsvLine(line); + expectedColumns = firstLineParsed.length; + logger.info(`Detected ${expectedColumns} columns using csv-simple-parser`); + } + + // Parse current line with csv-simple-parser + const parsedColumns = processStreamingCsvLine(line, expectedColumns); + + // Get label (no validation - ENS accepts any UTF-8 string) + const label = parsedColumns[0]; + + // Build rainbow record immediately (streaming) + let rainbowRecord; + + if (parsedColumns.length === 1) { + // Single column: compute labelhash using labelhash function + const labelHashBytes = labelHashToBytes(labelhash(label)); + + rainbowRecord = { + labelhash: Buffer.from(labelHashBytes), + label: label, + }; + } else { + // Two columns: validate and use provided hash + const [, providedHash] = parsedColumns; + + // Ensure the hash has 0x prefix for labelHashToBytes + const maybeLabelHash = providedHash.startsWith("0x") ? providedHash : `0x${providedHash}`; + const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); + + rainbowRecord = { + labelhash: Buffer.from(labelHash), + label: label, + }; + } + + // Create protobuf message and write immediately + const recordMessage = RainbowRecordType.fromObject(rainbowRecord); + outputStream.write(Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish())); + + processedRecords++; + + // Log progress for large files + if (processedRecords % 10000 === 0) { + logger.info(`Processed ${processedRecords} records so far...`); + } + } catch (error) { + const errorMessage = error instanceof Error ? 
error.message : String(error); + throw new Error( + `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, + ); + } + } + + stats.totalLines = lineNumber; + stats.processedRecords = processedRecords; + + // Close output stream + outputStream.end(); + + logger.info(`✅ Processed ${processedRecords} records with streaming csv-simple-parser`); + + logSummary(stats); + logger.info("✅ CSV conversion completed successfully!"); + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error); + logger.error("❌ CSV conversion failed:", errorMessage); + throw error; + } +} diff --git a/apps/ensrainbow/test/fixtures/test_labels_1col.csv b/apps/ensrainbow/test/fixtures/test_labels_1col.csv new file mode 100644 index 000000000..d809bd116 --- /dev/null +++ b/apps/ensrainbow/test/fixtures/test_labels_1col.csv @@ -0,0 +1,10 @@ +alice +bob +charlie +domaintest +example +foundation +governance +hello +world +test123 diff --git a/apps/ensrainbow/test/fixtures/test_labels_2col.csv b/apps/ensrainbow/test/fixtures/test_labels_2col.csv new file mode 100644 index 000000000..f410bf758 --- /dev/null +++ b/apps/ensrainbow/test/fixtures/test_labels_2col.csv @@ -0,0 +1,10 @@ +alice,0x9c0257114eb9399a2985f8e75dad7600c5d89fe3824ffa99ec1c3eb8bf3b0501 +bob,0x38e47a7b719dce63662aeaf43440326f551b8a7ee198cee35cb5d517f2d296a2 +charlie,0x87a213ce1ee769e28decedefb98f6fe48890a74ba84957ebf877fb591e37e0de +domaintest,0xc2d1b32ab4268fbba175baa3dcab1eb8299bc784030b080f28eaf1b9336c0445 +example,0x6fd43e7cffc31bb581d7421c8698e29aa2bd8e7186a394b85299908b4eb9b175 +foundation,0x0d5c1bd818a4086f28314415cb375a937593efab66f8f7d2903bf2a13ed35070 +governance,0xabea6fd3db56a6e6d0242111b43ebb13d1c42709651c032c7894962023a1f90a +hello,0x1c8aff950685c2ed4bc3174f3472287b56d9517b9c948127319a09a7a36deac8 +world,0x8452c9b9140222b08593a26daa782707297be9f7b3e8281d7b4974769f19afd0 +test123,0xf81b517a242b218999ec8eec0ea6e2ddbef2a367a14e93f4a32a39e260f686ad diff --git a/apps/ensrainbow/test/fixtures/test_labels_invalid_first.csv b/apps/ensrainbow/test/fixtures/test_labels_invalid_first.csv new file mode 100644 index 000000000..3d0b7b7e0 --- /dev/null +++ b/apps/ensrainbow/test/fixtures/test_labels_invalid_first.csv @@ -0,0 +1,3 @@ +label1,hash1,extra_column +validlabel +another_valid diff --git a/apps/ensrainbow/test/fixtures/test_labels_invalid_hash.csv b/apps/ensrainbow/test/fixtures/test_labels_invalid_hash.csv new file mode 100644 index 000000000..484983db9 --- /dev/null +++ b/apps/ensrainbow/test/fixtures/test_labels_invalid_hash.csv @@ -0,0 +1,4 @@ +validlabel,0x1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef +invalidhash,not-a-hex-hash +anotherlabel,0x123 +toolong,0x1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef123456789 diff --git a/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv b/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv new file mode 100644 index 000000000..a1cc2a55f --- /dev/null +++ b/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv @@ -0,0 +1,10 @@ +🔥emoji-label🚀 +"label,with,commas" +"label with newline\n character" +Ąśćžłñ-unicode +"label-with-null\0byte" +"quoted label with spaces" +中文-chinese +😀😁😂🤣-multiple-emojis +"special""quotes""inside" +café-àçćént diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 2045e22d5..8c8c0b79b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -462,6 +462,9 @@ importers: classic-level: specifier: ^1.4.1 version: 1.4.1 + csv-simple-parser: + specifier: ^2.0.2 + version: 
2.0.2 hono: specifier: 'catalog:' version: 4.10.3 @@ -4161,6 +4164,9 @@ packages: csstype@3.2.3: resolution: {integrity: sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==} + csv-simple-parser@2.0.2: + resolution: {integrity: sha512-G9KUSB7Bh8mRjZcg340FJM96tJYPPfb+UjR6T+dOcdRLChmwOTP6jB9+rJwmqDoaPHMJW/CXabYbJ1ZEjbkrrg==} + cytoscape-cose-bilkent@4.1.0: resolution: {integrity: sha512-wgQlVIUJF13Quxiv5e1gstZ08rnZj2XaLHGoFMYXz7SkNfCDOOteKBE6SYRfA9WxxI/iBc3ajfDoc6hb/MRAHQ==} peerDependencies: @@ -4404,6 +4410,9 @@ packages: destr@2.0.5: resolution: {integrity: sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA==} + detect-eol@3.0.1: + resolution: {integrity: sha512-ncnuLiZCKO7Kt+3CpwUIV8QnnwpBsSFxGQBY6Nve18K2aOrTim2xpzDa8YunHkePt39OCfV2qOX+b7xjYSDRWg==} + detect-indent@6.1.0: resolution: {integrity: sha512-reYkTUJAZb9gUuZ2RvVCNhVHdg62RHnJ7WJl8ftMi4diZ6NWlciOzQN88pUhSELEwflJht4oQDv0F0BMlwaYtA==} engines: {node: '>=8'} @@ -8672,7 +8681,7 @@ snapshots: '@expressive-code/plugin-shiki@0.41.3': dependencies: '@expressive-code/core': 0.41.3 - shiki: 3.14.0 + shiki: 3.15.0 '@expressive-code/plugin-text-markers@0.41.3': dependencies: @@ -11662,6 +11671,10 @@ snapshots: csstype@3.2.3: {} + csv-simple-parser@2.0.2: + dependencies: + detect-eol: 3.0.1 + cytoscape-cose-bilkent@4.1.0(cytoscape@3.33.1): dependencies: cose-base: 1.0.3 @@ -11905,6 +11918,8 @@ snapshots: destr@2.0.5: {} + detect-eol@3.0.1: {} + detect-indent@6.1.0: {} detect-libc@2.1.2: {} From b49144124c2550bc02385f8a4268b6462cd8dbec Mon Sep 17 00:00:00 2001 From: djstrong Date: Tue, 30 Sep 2025 17:25:09 +0200 Subject: [PATCH 02/30] refactor --- apps/ensrainbow/src/cli.ts | 9 +- .../src/commands/convert-csv-command.ts | 242 ++++++++++-------- 2 files changed, 148 insertions(+), 103 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 063c48df2..940692729 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -67,6 +67,7 @@ interface ConvertCsvArgs { "output-file": string; "label-set-id": LabelSetId; "label-set-version": LabelSetVersion; + "progress-interval"?: number; } export interface CLIOptions { @@ -253,7 +254,12 @@ export function createCLI(options: CLIOptions = {}) { description: "Label set version for the rainbow record collection", demandOption: true, }) - .coerce("label-set-version", buildLabelSetVersion); + .coerce("label-set-version", buildLabelSetVersion) + .option("progress-interval", { + type: "number", + description: "Number of records to process before logging progress", + default: 10000, + }); }, async (argv: ArgumentsCamelCase) => { await convertCsvCommand({ @@ -261,6 +267,7 @@ export function createCLI(options: CLIOptions = {}) { outputFile: argv["output-file"], labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], + progressInterval: argv["progress-interval"], }); }, ) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 1c04fbf5c..0b4ed5d6b 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -17,11 +17,14 @@ import { } from "../utils/protobuf-schema.js"; /** - * Parse CSV using csv-simple-parser + * Parse CSV using csv-simple-parser with proper type safety */ function parseCsvLine(line: string): string[] { const result = parse(line); - return result.length > 0 ? 
(result[0] as string[]) : []; + if (result.length === 0) return []; + const firstRow = result[0]; + if (!Array.isArray(firstRow)) return []; + return firstRow.filter((item) => typeof item === "string"); } // No label validation - ENS accepts any UTF-8 string @@ -31,14 +34,15 @@ export interface ConvertCsvCommandOptions { outputFile: string; labelSetId: string; labelSetVersion: number; + progressInterval?: number; } +// Configuration constants +const DEFAULT_PROGRESS_INTERVAL = 10000; + interface ConversionStats { totalLines: number; processedRecords: number; - skippedRecords: number; - invalidLabels: number; - duplicates: number; startTime: Date; endTime?: Date; } @@ -115,12 +119,123 @@ function logSummary(stats: ConversionStats) { logger.info("=== Conversion Summary ==="); logger.info(`Total lines processed: ${stats.totalLines}`); logger.info(`Valid records: ${stats.processedRecords}`); - logger.info(`Skipped records: ${stats.skippedRecords}`); - logger.info(`Invalid labels: ${stats.invalidLabels}`); - logger.info(`Duplicates found: ${stats.duplicates}`); logger.info(`Duration: ${duration}ms`); } +/** + * Initialize conversion setup and logging + */ +function initializeConversion(options: ConvertCsvCommandOptions) { + logger.info("Starting conversion from CSV to protobuf format..."); + logger.info(`Input file: ${options.inputFile}`); + logger.info(`Output file: ${options.outputFile}`); + logger.info(`Label set id: ${options.labelSetId}`); + logger.info(`Label set version: ${options.labelSetVersion}`); + + const { RainbowRecordType, RainbowRecordCollectionType } = createRainbowProtobufRoot(); + const outputStream = setupWriteStream(options.outputFile); + + writeHeader( + outputStream, + RainbowRecordCollectionType, + options.labelSetId, + options.labelSetVersion, + ); + + logger.info("Reading and processing CSV file line by line with streaming..."); + + return { RainbowRecordType, outputStream }; +} + +/** + * Create rainbow record from parsed CSV columns + */ +function createRainbowRecord(parsedColumns: string[]): { labelhash: Buffer; label: string } { + const label = parsedColumns[0]; + + if (parsedColumns.length === 1) { + // Single column: compute labelhash using labelhash function + const labelHashBytes = labelHashToBytes(labelhash(label)); + return { + labelhash: Buffer.from(labelHashBytes), + label: label, + }; + } else { + // Two columns: validate and use provided hash + const [, providedHash] = parsedColumns; + const maybeLabelHash = providedHash.startsWith("0x") ? 
providedHash : `0x${providedHash}`; + const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); + return { + labelhash: Buffer.from(labelHash), + label: label, + }; + } +} + +/** + * Process a single CSV record + */ +function processRecord( + line: string, + expectedColumns: number, + RainbowRecordType: any, + outputStream: NodeJS.WritableStream, +): void { + const parsedColumns = processStreamingCsvLine(line, expectedColumns); + const rainbowRecord = createRainbowRecord(parsedColumns); + + // Create protobuf message and write immediately + const recordMessage = RainbowRecordType.fromObject(rainbowRecord); + outputStream.write(Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish())); +} + +/** + * Process the entire CSV file + */ +async function processCSVFile( + rl: ReturnType, + RainbowRecordType: any, + outputStream: NodeJS.WritableStream, + progressInterval: number, +): Promise<{ totalLines: number; processedRecords: number }> { + let expectedColumns: number | null = null; + let lineNumber = 0; + let processedRecords = 0; + + for await (const line of rl) { + lineNumber++; + + // Skip empty lines + if (line.trim() === "") { + continue; + } + + try { + // For the first line, detect column count + if (expectedColumns === null) { + const firstLineParsed = parseCsvLine(line); + expectedColumns = firstLineParsed.length; + logger.info(`Detected ${expectedColumns} columns using csv-simple-parser`); + } + + processRecord(line, expectedColumns, RainbowRecordType, outputStream); + processedRecords++; + + // Log progress for large files + if (processedRecords % progressInterval === 0) { + logger.info(`Processed ${processedRecords} records so far...`); + } + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error); + throw new Error( + `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, + ); + } + } + + return { totalLines: lineNumber, processedRecords }; +} + /** * Main CSV conversion command with true streaming using csv-simple-parser */ @@ -128,121 +243,44 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom const stats: ConversionStats = { totalLines: 0, processedRecords: 0, - skippedRecords: 0, - invalidLabels: 0, - duplicates: 0, startTime: new Date(), }; + let rl: ReturnType | null = null; + try { - logger.info("Starting conversion from CSV to protobuf format..."); - logger.info(`Input file: ${options.inputFile}`); - logger.info(`Output file: ${options.outputFile}`); - logger.info(`Label set id: ${options.labelSetId}`); - logger.info(`Label set version: ${options.labelSetVersion}`); + const { RainbowRecordType, outputStream } = initializeConversion(options); - // Setup protobuf schema - const { RainbowRecordType, RainbowRecordCollectionType } = createRainbowProtobufRoot(); + // Setup streaming CSV reader + rl = setupReadStream(options.inputFile); - // Setup streams - const outputStream = setupWriteStream(options.outputFile); + const progressInterval = options.progressInterval ?? 
DEFAULT_PROGRESS_INTERVAL; - // Write header - writeHeader( + // Process the CSV file + const { totalLines, processedRecords } = await processCSVFile( + rl, + RainbowRecordType, outputStream, - RainbowRecordCollectionType, - options.labelSetId, - options.labelSetVersion, + progressInterval, ); - logger.info("Reading and processing CSV file line by line with streaming..."); - - // Setup streaming CSV reader - const rl = setupReadStream(options.inputFile); - - let expectedColumns: number | null = null; - let lineNumber = 0; - let processedRecords = 0; - - // Process line by line with csv-simple-parser - for await (const line of rl) { - lineNumber++; - - // Skip empty lines - if (line.trim() === "") { - continue; - } - - try { - // For the first line, detect column count - if (expectedColumns === null) { - const firstLineParsed = parseCsvLine(line); - expectedColumns = firstLineParsed.length; - logger.info(`Detected ${expectedColumns} columns using csv-simple-parser`); - } - - // Parse current line with csv-simple-parser - const parsedColumns = processStreamingCsvLine(line, expectedColumns); - - // Get label (no validation - ENS accepts any UTF-8 string) - const label = parsedColumns[0]; - - // Build rainbow record immediately (streaming) - let rainbowRecord; - - if (parsedColumns.length === 1) { - // Single column: compute labelhash using labelhash function - const labelHashBytes = labelHashToBytes(labelhash(label)); - - rainbowRecord = { - labelhash: Buffer.from(labelHashBytes), - label: label, - }; - } else { - // Two columns: validate and use provided hash - const [, providedHash] = parsedColumns; - - // Ensure the hash has 0x prefix for labelHashToBytes - const maybeLabelHash = providedHash.startsWith("0x") ? providedHash : `0x${providedHash}`; - const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); - - rainbowRecord = { - labelhash: Buffer.from(labelHash), - label: label, - }; - } - - // Create protobuf message and write immediately - const recordMessage = RainbowRecordType.fromObject(rainbowRecord); - outputStream.write(Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish())); - - processedRecords++; - - // Log progress for large files - if (processedRecords % 10000 === 0) { - logger.info(`Processed ${processedRecords} records so far...`); - } - } catch (error) { - const errorMessage = error instanceof Error ? error.message : String(error); - throw new Error( - `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, - ); - } - } - - stats.totalLines = lineNumber; + stats.totalLines = totalLines; stats.processedRecords = processedRecords; // Close output stream outputStream.end(); logger.info(`✅ Processed ${processedRecords} records with streaming csv-simple-parser`); - logSummary(stats); logger.info("✅ CSV conversion completed successfully!"); } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); logger.error("❌ CSV conversion failed:", errorMessage); throw error; + } finally { + // Ensure readline interface is properly closed to prevent resource leaks + if (rl) { + rl.close(); + } } } From 4c18e0b904791a51fb1baf0d3092b58908361629 Mon Sep 17 00:00:00 2001 From: "kwrobel.eth" Date: Tue, 30 Sep 2025 14:29:40 +0200 Subject: [PATCH 03/30] Create brave-kiwis-notice.md --- .changeset/brave-kiwis-notice.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/brave-kiwis-notice.md diff --git a/.changeset/brave-kiwis-notice.md b/.changeset/brave-kiwis-notice.md new file mode 100644 index 000000000..fbdba8bfc --- /dev/null +++ b/.changeset/brave-kiwis-notice.md @@ -0,0 +1,5 @@ +--- +"ensrainbow": patch +--- + +feat: add CSV conversion command to ensrainbow CLI From 5aefe9dab4bff69fbcadf14879186838edd78184 Mon Sep 17 00:00:00 2001 From: djstrong Date: Wed, 1 Oct 2025 17:21:11 +0200 Subject: [PATCH 04/30] fix tests --- .../src/commands/convert-csv-command.test.ts | 54 ++++++++++++++----- .../src/commands/convert-csv-command.ts | 7 ++- .../test/fixtures/test_labels_1col.csv | 1 + .../test/fixtures/test_labels_2col.csv | 2 +- .../fixtures/test_labels_special_chars.csv | 3 +- 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 2be46d924..16a6c5cdb 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -4,8 +4,10 @@ import { mkdtemp, rm, stat, writeFile } from "fs/promises"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { createCLI } from "@/cli"; -import { type LabelSetId, type LabelSetVersion } from "@ensnode/ensnode-sdk"; +import { labelHashToBytes, type LabelSetId, type LabelSetVersion } from "@ensnode/ensnode-sdk"; import { convertCsvCommand } from "./convert-csv-command"; +import { ENSRainbowDB } from "@/lib/database"; +import { labelhash } from "viem"; // Path to test fixtures const TEST_FIXTURES_DIR = join(__dirname, "..", "..", "test", "fixtures"); @@ -47,14 +49,13 @@ describe("convert-csv-command", () => { const cli = createCLI({ exitProcess: false }); await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); - // Verify database was created - const dbStats = await stat(dataDir); - expect(dbStats.isDirectory()).toBe(true); - - // Verify database contents by validating it - await cli.parse(["validate", "--data-dir", dataDir, "--lite"]); - - // Database validation passed, which means records are accessible + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + expect(recordsCount).toBe(11); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("123"))))?.label).toBe("123"); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + await db.close(); }); it("should convert two column CSV with provided hashes and ingest successfully", async () => { @@ -79,9 +80,13 @@ describe("convert-csv-command", () => { const cli = createCLI({ exitProcess: false }); await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); - // Verify database was created - const dbStats = await stat(dataDir); - expect(dbStats.isDirectory()).toBe(true); + const db = await ENSRainbowDB.open(dataDir); + 
expect(await db.validate()).toBe(true); + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + expect(recordsCount).toBe(10); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("test123"))))?.label).toBe("test123"); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + await db.close(); }); it("should fail when CSV has inconsistent column count", async () => { @@ -99,9 +104,10 @@ describe("convert-csv-command", () => { ).rejects.toThrow(/CSV conversion failed due to invalid data/); }); - it("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { + it.only("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { const inputFile = join(TEST_FIXTURES_DIR, "test_labels_special_chars.csv"); const outputFile = join(tempDir, "output_special.ensrainbow"); + const dataDir = join(tempDir, "db_special"); // Convert CSV to ensrainbow format await convertCsvCommand({ @@ -119,7 +125,27 @@ describe("convert-csv-command", () => { // Verify special characters were processed correctly by checking logs // The conversion completed successfully, which means csv-simple-parser // handled emojis, unicode, quoted fields with commas, etc. - expect(true).toBe(true); // Test passes if conversion doesn't crash + + // Ingest the converted file into database + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + expect(recordsCount).toBe(10); + const labels = [ + "🔥emoji-label🚀", + "special\"quotes\"inside", + "label with newline\n character", + "label-with-null\0byte", + ]; + for (const label of labels) { + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash(label))))?.label).toBe(label); + } + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + await db.close(); + }); it("should fail when CSV contains invalid labelhash format", async () => { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 0b4ed5d6b..7b08da655 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -20,15 +20,13 @@ import { * Parse CSV using csv-simple-parser with proper type safety */ function parseCsvLine(line: string): string[] { - const result = parse(line); + const result = parse(line, {optimistic: false}); if (result.length === 0) return []; const firstRow = result[0]; if (!Array.isArray(firstRow)) return []; - return firstRow.filter((item) => typeof item === "string"); + return firstRow.map((item) => String(item)); } -// No label validation - ENS accepts any UTF-8 string - export interface ConvertCsvCommandOptions { inputFile: string; outputFile: string; @@ -156,6 +154,7 @@ function createRainbowRecord(parsedColumns: string[]): { labelhash: Buffer; labe if (parsedColumns.length === 1) { // Single column: compute labelhash using labelhash function const labelHashBytes = labelHashToBytes(labelhash(label)); + console.log(label); return { labelhash: Buffer.from(labelHashBytes), label: label, diff --git a/apps/ensrainbow/test/fixtures/test_labels_1col.csv b/apps/ensrainbow/test/fixtures/test_labels_1col.csv index d809bd116..302ef8d63 100644 --- 
a/apps/ensrainbow/test/fixtures/test_labels_1col.csv +++ b/apps/ensrainbow/test/fixtures/test_labels_1col.csv @@ -8,3 +8,4 @@ governance hello world test123 +123 diff --git a/apps/ensrainbow/test/fixtures/test_labels_2col.csv b/apps/ensrainbow/test/fixtures/test_labels_2col.csv index f410bf758..e02a65762 100644 --- a/apps/ensrainbow/test/fixtures/test_labels_2col.csv +++ b/apps/ensrainbow/test/fixtures/test_labels_2col.csv @@ -1,7 +1,7 @@ alice,0x9c0257114eb9399a2985f8e75dad7600c5d89fe3824ffa99ec1c3eb8bf3b0501 bob,0x38e47a7b719dce63662aeaf43440326f551b8a7ee198cee35cb5d517f2d296a2 charlie,0x87a213ce1ee769e28decedefb98f6fe48890a74ba84957ebf877fb591e37e0de -domaintest,0xc2d1b32ab4268fbba175baa3dcab1eb8299bc784030b080f28eaf1b9336c0445 +domaintest,0x56827be2a1678c2593e2a613fe8c4138ec451ab019d70cd890e007f99b513be1 example,0x6fd43e7cffc31bb581d7421c8698e29aa2bd8e7186a394b85299908b4eb9b175 foundation,0x0d5c1bd818a4086f28314415cb375a937593efab66f8f7d2903bf2a13ed35070 governance,0xabea6fd3db56a6e6d0242111b43ebb13d1c42709651c032c7894962023a1f90a diff --git a/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv b/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv index a1cc2a55f..300cfc70a 100644 --- a/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv +++ b/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv @@ -1,6 +1,7 @@ 🔥emoji-label🚀 "label,with,commas" -"label with newline\n character" +"label with newline + character" Ąśćžłñ-unicode "label-with-null\0byte" "quoted label with spaces" From f2c8f20309c1d5e3f40c3ad8dc530e1200f697de Mon Sep 17 00:00:00 2001 From: djstrong Date: Wed, 1 Oct 2025 18:07:33 +0200 Subject: [PATCH 05/30] use fast-csv package --- apps/ensrainbow/package.json | 2 +- .../src/commands/convert-csv-command.test.ts | 29 +-- .../src/commands/convert-csv-command.ts | 175 +++++++----------- .../fixtures/test_labels_special_chars.csv | Bin 235 -> 234 bytes pnpm-lock.yaml | 60 ++++-- 5 files changed, 134 insertions(+), 132 deletions(-) diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index af46315e9..046cb2e2e 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -39,7 +39,7 @@ "protobufjs": "^7.4.0", "viem": "catalog:", "yargs": "^17.7.2", - "csv-simple-parser": "^2.0.2" + "@fast-csv/parse": "^5.0.0" }, "devDependencies": { "@ensnode/shared-configs": "workspace:*", diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 16a6c5cdb..795e53bdc 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -4,10 +4,10 @@ import { mkdtemp, rm, stat, writeFile } from "fs/promises"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { createCLI } from "@/cli"; -import { labelHashToBytes, type LabelSetId, type LabelSetVersion } from "@ensnode/ensnode-sdk"; -import { convertCsvCommand } from "./convert-csv-command"; import { ENSRainbowDB } from "@/lib/database"; +import { type LabelSetId, type LabelSetVersion, labelHashToBytes } from "@ensnode/ensnode-sdk"; import { labelhash } from "viem"; +import { convertCsvCommand } from "./convert-csv-command"; // Path to test fixtures const TEST_FIXTURES_DIR = join(__dirname, "..", "..", "test", "fixtures"); @@ -53,8 +53,10 @@ describe("convert-csv-command", () => { expect(await db.validate()).toBe(true); const recordsCount = await db.getPrecalculatedRainbowRecordCount(); 
expect(recordsCount).toBe(11); - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("123"))))?.label).toBe("123"); - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("123"))))?.label).toBe( + "123", + ); + expect(await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234")))).toBe(null); await db.close(); }); @@ -84,8 +86,10 @@ describe("convert-csv-command", () => { expect(await db.validate()).toBe(true); const recordsCount = await db.getPrecalculatedRainbowRecordCount(); expect(recordsCount).toBe(10); - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("test123"))))?.label).toBe("test123"); - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("test123"))))?.label, + ).toBe("test123"); + expect(await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234")))).toBe(null); await db.close(); }); @@ -104,7 +108,7 @@ describe("convert-csv-command", () => { ).rejects.toThrow(/CSV conversion failed due to invalid data/); }); - it.only("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { + it("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { const inputFile = join(TEST_FIXTURES_DIR, "test_labels_special_chars.csv"); const outputFile = join(tempDir, "output_special.ensrainbow"); const dataDir = join(tempDir, "db_special"); @@ -135,17 +139,18 @@ describe("convert-csv-command", () => { const recordsCount = await db.getPrecalculatedRainbowRecordCount(); expect(recordsCount).toBe(10); const labels = [ - "🔥emoji-label🚀", - "special\"quotes\"inside", + "🔥emoji-label🚀", + 'special"quotes"inside', "label with newline\n character", "label-with-null\0byte", ]; for (const label of labels) { - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash(label))))?.label).toBe(label); + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash(label))))?.label, + ).toBe(label); } - expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234"))))).toBe(null); + expect(await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("1234")))).toBe(null); await db.close(); - }); it("should fail when CSV contains invalid labelhash format", async () => { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 7b08da655..14ae2d4b3 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -1,14 +1,13 @@ /** * ENSRAINBOW CSV FILE CREATION COMMAND * - * Converts CSV files to .ensrainbow format with csv-simple-parser + * Converts CSV files to .ensrainbow format with fast-csv * Supports 1-column (label only) and 2-column (label,labelhash) formats */ import { createReadStream, createWriteStream } from "fs"; -import { createInterface } from "readline"; import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk"; -import parse from "csv-simple-parser"; +import { parse } from "@fast-csv/parse"; import { labelhash } from "viem"; import { logger } from "../utils/logger.js"; import { @@ -16,17 +15,6 @@ import { createRainbowProtobufRoot, } from "../utils/protobuf-schema.js"; -/** - * Parse CSV using csv-simple-parser with proper type safety - */ -function parseCsvLine(line: 
string): string[] { - const result = parse(line, {optimistic: false}); - if (result.length === 0) return []; - const firstRow = result[0]; - if (!Array.isArray(firstRow)) return []; - return firstRow.map((item) => String(item)); -} - export interface ConvertCsvCommandOptions { inputFile: string; outputFile: string; @@ -45,37 +33,6 @@ interface ConversionStats { endTime?: Date; } -/** - * Process a single CSV line with csv-simple-parser and validation - */ -function processStreamingCsvLine(line: string, expectedColumns: number): string[] { - if (line.trim() === "") { - throw new Error("Empty line"); - } - - const parsedLine = parseCsvLine(line); - - // Validate column count - if (parsedLine.length !== expectedColumns) { - throw new Error( - `Expected ${expectedColumns} columns, but found ${parsedLine.length} in line: ${line}`, - ); - } - - return parsedLine; -} - -/** - * Setup input stream for reading CSV line by line - */ -function setupReadStream(inputFile: string) { - const fileStream = createReadStream(inputFile, { encoding: "utf8" }); - return createInterface({ - input: fileStream, - crlfDelay: Infinity, - }); -} - /** * Setup output stream for writing protobuf */ @@ -146,12 +103,12 @@ function initializeConversion(options: ConvertCsvCommandOptions) { } /** - * Create rainbow record from parsed CSV columns + * Create rainbow record from parsed CSV row */ -function createRainbowRecord(parsedColumns: string[]): { labelhash: Buffer; label: string } { - const label = parsedColumns[0]; +function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string } { + const label = String(row[0]); - if (parsedColumns.length === 1) { + if (row.length === 1) { // Single column: compute labelhash using labelhash function const labelHashBytes = labelHashToBytes(labelhash(label)); console.log(label); @@ -161,7 +118,7 @@ function createRainbowRecord(parsedColumns: string[]): { labelhash: Buffer; labe }; } else { // Two columns: validate and use provided hash - const [, providedHash] = parsedColumns; + const providedHash = String(row[1]); const maybeLabelHash = providedHash.startsWith("0x") ? 
providedHash : `0x${providedHash}`; const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); return { @@ -175,13 +132,20 @@ function createRainbowRecord(parsedColumns: string[]): { labelhash: Buffer; labe * Process a single CSV record */ function processRecord( - line: string, + row: string[], expectedColumns: number, RainbowRecordType: any, outputStream: NodeJS.WritableStream, + lineNumber: number, ): void { - const parsedColumns = processStreamingCsvLine(line, expectedColumns); - const rainbowRecord = createRainbowRecord(parsedColumns); + // Validate column count + if (row.length !== expectedColumns) { + throw new Error( + `Expected ${expectedColumns} columns, but found ${row.length} in line ${lineNumber}`, + ); + } + + const rainbowRecord = createRainbowRecord(row); // Create protobuf message and write immediately const recordMessage = RainbowRecordType.fromObject(rainbowRecord); @@ -189,54 +153,67 @@ function processRecord( } /** - * Process the entire CSV file + * Process the entire CSV file using fast-csv */ async function processCSVFile( - rl: ReturnType, + inputFile: string, RainbowRecordType: any, outputStream: NodeJS.WritableStream, progressInterval: number, ): Promise<{ totalLines: number; processedRecords: number }> { - let expectedColumns: number | null = null; - let lineNumber = 0; - let processedRecords = 0; - - for await (const line of rl) { - lineNumber++; - - // Skip empty lines - if (line.trim() === "") { - continue; - } - - try { - // For the first line, detect column count - if (expectedColumns === null) { - const firstLineParsed = parseCsvLine(line); - expectedColumns = firstLineParsed.length; - logger.info(`Detected ${expectedColumns} columns using csv-simple-parser`); - } - - processRecord(line, expectedColumns, RainbowRecordType, outputStream); - processedRecords++; - - // Log progress for large files - if (processedRecords % progressInterval === 0) { - logger.info(`Processed ${processedRecords} records so far...`); - } - } catch (error) { - const errorMessage = error instanceof Error ? error.message : String(error); - throw new Error( - `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, - ); - } - } - - return { totalLines: lineNumber, processedRecords }; + return new Promise((resolve, reject) => { + let expectedColumns: number | null = null; + let lineNumber = 0; + let processedRecords = 0; + + const fileStream = createReadStream(inputFile, { encoding: "utf8" }); + + const csvStream = parse() + .on("data", (row: string[]) => { + lineNumber++; + + try { + // For the first row, detect column count + if (expectedColumns === null) { + expectedColumns = row.length; + logger.info(`Detected ${expectedColumns} columns using fast-csv`); + } + + processRecord(row, expectedColumns, RainbowRecordType, outputStream, lineNumber); + processedRecords++; + + // Log progress for large files + if (processedRecords % progressInterval === 0) { + logger.info(`Processed ${processedRecords} records so far...`); + } + } catch (error) { + const errorMessage = error instanceof Error ? 
error.message : String(error); + csvStream.destroy(); + fileStream.destroy(); + reject( + new Error( + `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, + ), + ); + } + }) + .on("error", (error: Error) => { + reject(new Error(`CSV parsing error: ${error.message}`)); + }) + .on("end", () => { + resolve({ totalLines: lineNumber, processedRecords }); + }); + + fileStream + .on("error", (error: Error) => { + reject(error); + }) + .pipe(csvStream); + }); } /** - * Main CSV conversion command with true streaming using csv-simple-parser + * Main CSV conversion command with true streaming using fast-csv */ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Promise { const stats: ConversionStats = { @@ -245,19 +222,14 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom startTime: new Date(), }; - let rl: ReturnType | null = null; - try { const { RainbowRecordType, outputStream } = initializeConversion(options); - // Setup streaming CSV reader - rl = setupReadStream(options.inputFile); - const progressInterval = options.progressInterval ?? DEFAULT_PROGRESS_INTERVAL; // Process the CSV file const { totalLines, processedRecords } = await processCSVFile( - rl, + options.inputFile, RainbowRecordType, outputStream, progressInterval, @@ -269,17 +241,12 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom // Close output stream outputStream.end(); - logger.info(`✅ Processed ${processedRecords} records with streaming csv-simple-parser`); + logger.info(`✅ Processed ${processedRecords} records with streaming fast-csv`); logSummary(stats); logger.info("✅ CSV conversion completed successfully!"); } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); logger.error("❌ CSV conversion failed:", errorMessage); throw error; - } finally { - // Ensure readline interface is properly closed to prevent resource leaks - if (rl) { - rl.close(); - } } } diff --git a/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv b/apps/ensrainbow/test/fixtures/test_labels_special_chars.csv index 300cfc70a9f1230c7346e7b38832f742eb463706..ac2a1f80d8fad7fafbcde1febbe21d95dd15e545 100644 GIT binary patch delta 11 ScmaFO_=<5tE+fOl{2Blp00dG1 delta 12 TcmaFG_?mG-E>n!b#Jm~+A!G#J diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8c8c0b79b..3dea391e0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -456,15 +456,15 @@ importers: '@ensnode/ensrainbow-sdk': specifier: workspace:* version: link:../../packages/ensrainbow-sdk + '@fast-csv/parse': + specifier: ^5.0.0 + version: 5.0.5 '@hono/node-server': specifier: ^1.4.1 version: 1.19.5(hono@4.10.3) classic-level: specifier: ^1.4.1 version: 1.4.1 - csv-simple-parser: - specifier: ^2.0.2 - version: 2.0.2 hono: specifier: 'catalog:' version: 4.10.3 @@ -1518,6 +1518,9 @@ packages: '@expressive-code/plugin-text-markers@0.41.3': resolution: {integrity: sha512-SN8tkIzDpA0HLAscEYD2IVrfLiid6qEdE9QLlGVSxO1KEw7qYvjpbNBQjUjMr5/jvTJ7ys6zysU2vLPHE0sb2g==} + '@fast-csv/parse@5.0.5': + resolution: {integrity: sha512-M0IbaXZDbxfOnpVE5Kps/a6FGlILLhtLsvWd9qNH3d2TxNnpbNkFf3KD26OmJX6MHq7PdQAl5htStDwnuwHx6w==} + '@fastify/busboy@3.2.0': resolution: {integrity: sha512-m9FVDXU3GT2ITSe0UaMA5rU3QkfC/UXtCU8y0gSN/GugTqtVldOBWIB5V6V3sbmenVZUIpU6f+mPEO2+m5iTaA==} @@ -4164,9 +4167,6 @@ packages: csstype@3.2.3: resolution: {integrity: sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==} - csv-simple-parser@2.0.2: - resolution: {integrity: sha512-G9KUSB7Bh8mRjZcg340FJM96tJYPPfb+UjR6T+dOcdRLChmwOTP6jB9+rJwmqDoaPHMJW/CXabYbJ1ZEjbkrrg==} - cytoscape-cose-bilkent@4.1.0: resolution: {integrity: sha512-wgQlVIUJF13Quxiv5e1gstZ08rnZj2XaLHGoFMYXz7SkNfCDOOteKBE6SYRfA9WxxI/iBc3ajfDoc6hb/MRAHQ==} peerDependencies: @@ -4410,9 +4410,6 @@ packages: destr@2.0.5: resolution: {integrity: sha512-ugFTXCtDZunbzasqBxrK93Ik/DRYsO6S/fedkWEMKqt04xZ4csmnmwGDBAb07QWNaGMAmnTIemsYZCksjATwsA==} - detect-eol@3.0.1: - resolution: {integrity: sha512-ncnuLiZCKO7Kt+3CpwUIV8QnnwpBsSFxGQBY6Nve18K2aOrTim2xpzDa8YunHkePt39OCfV2qOX+b7xjYSDRWg==} - detect-indent@6.1.0: resolution: {integrity: sha512-reYkTUJAZb9gUuZ2RvVCNhVHdg62RHnJ7WJl8ftMi4diZ6NWlciOzQN88pUhSELEwflJht4oQDv0F0BMlwaYtA==} engines: {node: '>=8'} @@ -5480,12 +5477,30 @@ packages: lodash.debounce@4.0.8: resolution: {integrity: sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==} + lodash.escaperegexp@4.1.2: + resolution: {integrity: sha512-TM9YBvyC84ZxE3rgfefxUWiQKLilstD6k7PTGt6wfbtXF8ixIJLOL3VYyV/z+ZiPLsVxAsKAFVwWlWeb2Y8Yyw==} + + lodash.groupby@4.6.0: + resolution: {integrity: sha512-5dcWxm23+VAoz+awKmBaiBvzox8+RqMgFhi7UvX9DHZr2HdxHXM/Wrf8cfKpsW37RNrvtPn6hSwNqurSILbmJw==} + + lodash.isfunction@3.0.9: + resolution: {integrity: sha512-AirXNj15uRIMMPihnkInB4i3NHeb4iBtNg9WRWuK2o31S+ePwwNmDPaTL3o7dTJ+VXNZim7rFs4rxN4YU1oUJw==} + + lodash.isnil@4.0.0: + resolution: {integrity: sha512-up2Mzq3545mwVnMhTDMdfoG1OurpA/s5t88JmQX809eH3C8491iu2sfKhTfhQtKY78oPNhiaHJUpT/dUDAAtng==} + + lodash.isundefined@3.0.1: + resolution: {integrity: sha512-MXB1is3s899/cD8jheYYE2V9qTHwKvt+npCwpD+1Sxm3Q3cECXCiYHjeHWXNwr6Q0SOBPrYUDxendrO6goVTEA==} + lodash.sortby@4.7.0: resolution: {integrity: 
sha512-HDWXG8isMntAyRF5vZ7xKuEvOhT4AhlRt/3czTSjvGUxjYCBVRQY48ViDHyfYz9VIoBkW4TMGQNapx+l3RUwdA==} lodash.startcase@4.4.0: resolution: {integrity: sha512-+WKqsK294HMSc2jEbNgpHpd0JfIBhp7rEV4aqXWqFr6AlXov+SlcgB1Fv01y2kGe3Gc8nMW7VA0SrGuSkRfIEg==} + lodash.uniq@4.5.0: + resolution: {integrity: sha512-xfBaXQd9ryd9dlSDvnvI0lvxfLJlYAZzXomUYzLKtUeOQvOP5piqAWuGtrhWeqaXK9hhoM/iyJc5AV+XfsX3HQ==} + lodash@4.17.21: resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==} @@ -8687,6 +8702,15 @@ snapshots: dependencies: '@expressive-code/core': 0.41.3 + '@fast-csv/parse@5.0.5': + dependencies: + lodash.escaperegexp: 4.1.2 + lodash.groupby: 4.6.0 + lodash.isfunction: 3.0.9 + lodash.isnil: 4.0.0 + lodash.isundefined: 3.0.1 + lodash.uniq: 4.5.0 + '@fastify/busboy@3.2.0': {} '@floating-ui/core@1.7.3': @@ -11671,10 +11695,6 @@ snapshots: csstype@3.2.3: {} - csv-simple-parser@2.0.2: - dependencies: - detect-eol: 3.0.1 - cytoscape-cose-bilkent@4.1.0(cytoscape@3.33.1): dependencies: cose-base: 1.0.3 @@ -11918,8 +11938,6 @@ snapshots: destr@2.0.5: {} - detect-eol@3.0.1: {} - detect-indent@6.1.0: {} detect-libc@2.1.2: {} @@ -13029,10 +13047,22 @@ snapshots: lodash.debounce@4.0.8: {} + lodash.escaperegexp@4.1.2: {} + + lodash.groupby@4.6.0: {} + + lodash.isfunction@3.0.9: {} + + lodash.isnil@4.0.0: {} + + lodash.isundefined@3.0.1: {} + lodash.sortby@4.7.0: {} lodash.startcase@4.4.0: {} + lodash.uniq@4.5.0: {} + lodash@4.17.21: {} long@5.3.2: {} From e20932db1e0c53549aa1a35aecd5eb76be8564cc Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 6 Oct 2025 16:44:32 +0200 Subject: [PATCH 06/30] add documentation for csv convert --- .../src/commands/convert-csv-command.test.ts | 8 +- .../ensrainbow/concepts/creating-files.mdx | 593 ++++++++++++++++++ .../docs/ensrainbow/concepts/data-model.mdx | 11 +- .../docs/ensrainbow/contributing/index.mdx | 5 +- 4 files changed, 604 insertions(+), 13 deletions(-) create mode 100644 docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 795e53bdc..58c7af900 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -126,10 +126,6 @@ describe("convert-csv-command", () => { expect(outputStats.isFile()).toBe(true); expect(outputStats.size).toBeGreaterThan(0); - // Verify special characters were processed correctly by checking logs - // The conversion completed successfully, which means csv-simple-parser - // handled emojis, unicode, quoted fields with commas, etc. 
- // Ingest the converted file into database const cli = createCLI({ exitProcess: false }); await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); @@ -141,8 +137,8 @@ describe("convert-csv-command", () => { const labels = [ "🔥emoji-label🚀", 'special"quotes"inside', - "label with newline\n character", - "label-with-null\0byte", + "label with newline\n character", // new line + "label-with-null\0byte", // null byte ]; for (const label of labels) { expect( diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx new file mode 100644 index 000000000..f2c9c34cf --- /dev/null +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -0,0 +1,593 @@ +--- +title: Creating ENSRainbow Files +description: Complete guide to creating .ensrainbow files from SQL dumps and CSV data. +sidebar: + label: Creating Files + order: 3 +keywords: [ensrainbow, file creation, conversion, sql, csv] +--- + +ENSRainbow provides two methods for creating `.ensrainbow` files from different data sources. This guide helps you choose the right method and provides step-by-step instructions. + +## Prerequisites + +Before creating `.ensrainbow` files, ensure you have: + +1. **ENSNode repository cloned**: + ```bash + git clone https://github.com/namehash/ensnode.git + cd ensnode + ``` + +2. **Dependencies installed**: + ```bash + pnpm install + ``` + +3. **Working directory**: Navigate to the ENSRainbow directory: + ```bash + cd apps/ensrainbow + ``` + +All commands in this guide assume you're in the `apps/ensrainbow` directory unless otherwise specified. + +## Overview + +A `.ensrainbow` file is ENSRainbow's binary format for storing label-to-labelhash mappings. It uses Protocol Buffers for efficient serialization and supports streaming for large datasets. + +For detailed information about the file format structure, see the [Data Model](/ensrainbow/concepts/data-model) documentation. + +## Choosing Your Conversion Method + +| Method | Input Format | Use Case | Command | +|--------|-------------|----------|---------| +| **SQL Conversion** | Gzipped SQL dump (`ens_names.sql.gz`) | Converting legacy ENS Subgraph data | `pnpm run convert` | +| **CSV Conversion** | CSV file (1 or 2 columns) | Custom datasets, test data, external sources | `pnpm run convert-csv` | + +### When to Use SQL Conversion + +- Converting existing ENS Subgraph rainbow tables +- Working with legacy `ens_names.sql.gz` files +- Migrating from previous ENS data formats + +### When to Use CSV Conversion + +- Creating test datasets +- Converting data from external sources +- Working with custom label collections +- Building incremental label sets + +## Method 1: Converting from SQL Dumps + +The `convert` command processes gzipped SQL dump files from the ENS Subgraph. 
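+
+For orientation, here is a hypothetical excerpt of the kind of dump the command consumes, assuming the Postgres `COPY` format produced by ENS Subgraph exports; the table and column names in your dump may differ:
+
+```sql
+COPY public.ens_names (hashed_name, name) FROM stdin;
+0xaf2caa1c2ca1d027f1ac823b529d0a67cd144264b2789fa2ea4d63a67c7103cc	vitalik
+0x5cee339e13375638553bdf5a6e36ba80fb9f6a4f0783680884d92b558aa471da	ens
+\.
+```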
+ +### Command Syntax + +```bash +pnpm run convert \ + --input-file \ + --output-file \ + --label-set-id \ + --label-set-version +``` + +### Required Parameters + +- `--input-file`: Path to the gzipped SQL dump file +- `--label-set-id`: Identifier for the label set (e.g., `subgraph`, `discovery-a`) +- `--label-set-version`: Version number for the label set (non-negative integer) + +### Optional Parameters + +- `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) + +### Example: Converting ENS Subgraph Data + +```bash +# Convert main ENS Subgraph data +pnpm run convert \ + --input-file ens_names.sql.gz \ + --output-file subgraph_0.ensrainbow \ + --label-set-id subgraph \ + --label-set-version 0 +``` + +### Example: Converting Test Data + +```bash +# Convert ens-test-env data +pnpm run convert \ + --input-file test/fixtures/ens_test_env_names.sql.gz \ + --output-file ens-test-env_0.ensrainbow \ + --label-set-id ens-test-env \ + --label-set-version 0 +``` + +### How It Works + +1. **Streams** the gzipped SQL file to avoid memory issues +2. **Parses** SQL COPY statements to extract label/labelhash pairs +3. **Validates** each record and skips invalid entries +4. **Writes** protobuf messages with length-delimited encoding +5. **Creates** a header message followed by individual record messages + +## Method 2: Converting from CSV Files + +The `convert-csv` command processes CSV files with flexible column formats. + +### Command Syntax + +```bash +pnpm run convert-csv \ + --input-file \ + --output-file \ + --label-set-id \ + --label-set-version \ + [--progress-interval ] +``` + +### Required Parameters + +- `--input-file`: Path to the CSV file +- `--label-set-id`: Identifier for the label set +- `--label-set-version`: Version number for the label set + +### Optional Parameters + +- `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) +- `--progress-interval`: Progress logging frequency (default: 10000 records) + +### CSV Format Support + +The CSV converter supports two formats: + +#### Single Column Format (Label Only) +```csv +ethereum +vitalik +ens +``` + +The converter automatically computes labelhashes using the `labelhash()` function. + +#### Two Column Format (Label + Labelhash) +```csv +ethereum,0x541111248b45b7a8dc3f5579f630e74cb01456ea6ac067d3f4d793245a255155 +vitalik,0xaf2caa1c2ca1d027f1ac823b529d0a67cd144264b2789fa2ea4d63a67c7103cc +ens,0x5cee339e13375638553bdf5a6e36ba80fb9f6a4f0783680884d92b558aa471da +``` + +The converter validates that provided labelhashes match the computed hash for each label. + +### Example: Creating Test Dataset + +```bash +# Create test dataset from CSV +pnpm run convert-csv \ + --input-file test-labels.csv \ + --output-file test-dataset_0.ensrainbow \ + --label-set-id test-dataset \ + --label-set-version 0 +``` + +### Example: Creating Discovery Dataset + +```bash +# Create discovery dataset (initially empty) +echo "" > empty.csv +pnpm run convert-csv \ + --input-file empty.csv \ + --output-file discovery-a_0.ensrainbow \ + --label-set-id discovery-a \ + --label-set-version 0 +``` + +### How It Works + +1. **Detects** CSV format automatically (1 or 2 columns) +2. **Streams** CSV parsing using fast-csv for memory efficiency +3. **Validates** column count and data format +4. **Computes** or validates labelhashes as needed +5. **Writes** protobuf messages with the same format as SQL conversion + +## Common Workflows + +### Workflow 1: Migrating from ENS Subgraph + +```bash +# 1. 
Convert SQL dump to .ensrainbow +pnpm run convert \ + --input-file ens_names.sql.gz \ + --output-file subgraph_0.ensrainbow \ + --label-set-id subgraph \ + --label-set-version 0 + +# 2. Ingest into LevelDB +pnpm run ingest-ensrainbow \ + --input-file subgraph_0.ensrainbow \ + --data-dir data-subgraph + +# 3. Validate the database +pnpm run validate --data-dir data-subgraph + +# 4. Start the API server +pnpm run serve --data-dir data-subgraph --port 3223 +``` + +### Workflow 2: Creating Test Environment + +```bash +# 1. Convert test data +pnpm run convert \ + --input-file test/fixtures/ens_test_env_names.sql.gz \ + --output-file ens-test-env_0.ensrainbow \ + --label-set-id ens-test-env \ + --label-set-version 0 + +# 2. Ingest test data +pnpm run ingest-ensrainbow \ + --input-file ens-test-env_0.ensrainbow \ + --data-dir data-test-env + +# 3. Run with test data +pnpm run serve --data-dir data-test-env --port 3223 +``` + +### Workflow 3: Building Custom Dataset + +```bash +# 1. Create CSV with your labels +echo "mylabel1 +mylabel2 +mylabel3" > custom-labels.csv + +# 2. Convert to .ensrainbow +pnpm run convert-csv \ + --input-file custom-labels.csv \ + --output-file custom_0.ensrainbow \ + --label-set-id custom \ + --label-set-version 0 + +# 3. Ingest and serve +pnpm run ingest-ensrainbow \ + --input-file custom_0.ensrainbow \ + --data-dir data-custom + +pnpm run serve --data-dir data-custom --port 3223 +``` + +### Workflow 4: Using Custom Label Set Server + +```bash +# 1. Configure custom label set server +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# 2. Download from custom server +# The script downloads to labelsets/ subdirectory +./scripts/download-ensrainbow-files.sh my-dataset 0 + +# 3. Ingest and serve +# Files are downloaded to labelsets/ by the script +pnpm run ingest-ensrainbow \ + --input-file labelsets/my-dataset_0.ensrainbow \ + --data-dir data-my-dataset + +pnpm run serve --data-dir data-my-dataset --port 3223 +``` + +:::note[Script Output Locations] +ENSRainbow download scripts save files to specific subdirectories: +- **`.ensrainbow` files**: `labelsets/` +- **Database archives**: `databases/{schema_version}/` +- **Checksums and licenses**: Same directory as the downloaded file +::: + +## File Naming Conventions + +Follow the naming convention: `{label-set-id}_{label-set-version}.ensrainbow` + +**Examples:** +- `subgraph_0.ensrainbow` - Main ENS data, version 0 +- `subgraph_1.ensrainbow` - Main ENS data, version 1 (incremental update) +- `discovery-a_0.ensrainbow` - Discovery dataset, version 0 +- `ens-test-env_0.ensrainbow` - Test environment data, version 0 + +## Next Steps + +After creating your `.ensrainbow` file: + +1. **[Ingest the data](/ensrainbow/contributing/index#data-ingestion-ingest-ensrainbow)** into a ENSRainbow database +2. **[Validate the database](/ensrainbow/contributing/index#database-validation-validate)** to ensure integrity +3. **[Start the API server](/ensrainbow/contributing/index#api-server-serve)** to serve the data + +For complete CLI reference information, see the [CLI Reference](/ensrainbow/contributing/cli-reference) documentation. + +## Creating and Publishing Custom .ensrainbow Files + +If you want to create, publish, and distribute your own `.ensrainbow` files, follow these steps: + +### 1. 
Create Your Dataset + +First, prepare your data in either SQL or CSV (recommended) format, then convert it using the appropriate method: + +```bash +# For CSV data +pnpm run convert-csv \ + --input-file my-labels.csv \ + --output-file my-dataset_0.ensrainbow \ + --label-set-id my-dataset \ + --label-set-version 0 + +# For SQL data +pnpm run convert \ + --input-file my-data.sql.gz \ + --output-file my-dataset_0.ensrainbow \ + --label-set-id my-dataset \ + --label-set-version 0 +``` + +### 2. Validate Your File + +Test your `.ensrainbow` file by ingesting it locally: + +```bash +# Ingest your custom dataset +pnpm run ingest-ensrainbow \ + --input-file my-dataset_0.ensrainbow \ + --data-dir data-my-dataset + +# Validate the database +pnpm run validate --data-dir data-my-dataset + +# Test the API +pnpm run serve --data-dir data-my-dataset --port 3223 +``` + +### 3. Publish Your File + +#### Option A: Direct File Sharing +- Upload your `.ensrainbow` file to a web server or cloud storage +- Provide a direct download URL +- Share checksums for integrity verification + +#### Option B: Package as Database Archive +For better performance, package your data as a pre-built database: + +```bash +# Ingest your .ensrainbow file +pnpm run ingest-ensrainbow \ + --input-file my-dataset_0.ensrainbow \ + --data-dir data-my-dataset + +# Package the database +tar -czvf my-dataset_0.tgz ./data-my-dataset + +# Calculate checksum +sha256sum my-dataset_0.tgz > my-dataset_0.tgz.sha256sum +``` + +### 4. Document Your Label Set + +Create documentation for your custom label set including: + +- **Label Set ID**: The identifier users will specify +- **Description**: What labels are included and their source +- **Version**: Current version number +- **Download URLs**: Where to get the files +- **Checksums**: For integrity verification +- **Usage Examples**: How to use your dataset + +### Example Documentation Format + +```markdown +## Custom Label Set: my-dataset + +**Label Set ID**: `my-dataset` +**Current Version**: `0` +**Description**: Custom ENS labels from [source description] + +### Download +- Database Archive: `https://example.com/my-dataset_0.tgz` +- Checksum: `https://example.com/my-dataset_0.tgz.sha256sum` + +### Usage +```bash +# Using with Docker +docker run -d \ + -e DB_SCHEMA_VERSION="3" \ + -e LABEL_SET_ID="my-dataset" \ + -e LABEL_SET_VERSION="0" \ + -p 3223:3223 \ + ghcr.io/namehash/ensnode/ensrainbow:latest +``` + +## Setting Up Your Own Label Set Server + +A **Label Set Server** is a storage and hosting service for `.ensrainbow` files and prebuilt database archives. It's not the ENSRainbow API server itself, but rather a way to distribute your custom datasets for others to download and use. + +### 1. Choose Your Hosting Platform + +You can host your label set files on any web server or cloud storage service: + +- **AWS S3**: Industry standard with versioning +- **Cloudflare R2**: Cost-effective alternative to S3 +- **Simple HTTP server**: For internal/private use + +### 2. Organize Your Files + +Structure your label set files following ENSRainbow conventions: + +``` +my-label-set-server/ +├── labelsets/ +│ ├── my-dataset_0.ensrainbow +│ ├── my-dataset_0.ensrainbow.sha256sum +│ ├── my-dataset_1.ensrainbow +│ └── my-dataset_1.ensrainbow.sha256sum +└── databases/ + ├── 3/ # Schema version + │ ├── my-dataset_0.tgz + │ ├── my-dataset_0.tgz.sha256sum + │ ├── my-dataset_1.tgz + │ └── my-dataset_1.tgz.sha256sum + └── 4/ # Future schema version +``` + +### 3. 
Use Existing Download Scripts + +ENSRainbow provides ready-to-use download scripts that users can configure to download from your label set server: + +#### Download .ensrainbow Files +```bash +# Configure your label set server URL +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# Download .ensrainbow file using the existing script +./scripts/download-ensrainbow-files.sh my-dataset 0 +``` + +#### Download Prebuilt Database Archives +```bash +# Configure your label set server URL +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# Download prebuilt database using the existing script +./scripts/download-prebuilt-database.sh 3 my-dataset 0 +``` + +#### Script Features +The existing scripts automatically handle: +- **Checksum verification** for data integrity +- **Resume downloads** if files already exist and are valid +- **License file downloads** (optional) +- **Progress reporting** for large files +- **Error handling** with cleanup of partial downloads + +### 4. Document Your Label Set Server + +Create a README or documentation page for your label set server: + +```markdown +# My Label Set Server + +This server hosts custom ENS label sets for ENSRainbow. + +## Available Label Sets + +### my-dataset +- **Description**: Custom ENS labels from [source] +- **Versions**: 0, 1 +- **Schema Versions**: 3 +- **Base URL**: `https://my-label-set-server.com` + +### another-dataset +- **Description**: Additional labels from [source] +- **Versions**: 0 +- **Schema Versions**: 3 +- **Base URL**: `https://my-label-set-server.com` +``` + +## Usage + +Users should have the ENSNode repository cloned and be in the `apps/ensrainbow` directory. + +### Option 1: Download .ensrainbow Files + +```bash +# Configure your label set server +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# Download .ensrainbow file +./scripts/download-ensrainbow-files.sh my-dataset 0 + +# Ingest into ENSRainbow +pnpm run ingest-ensrainbow \ + --input-file labelsets/my-dataset_0.ensrainbow \ + --data-dir data-my-dataset + +# Start ENSRainbow server +pnpm run serve --data-dir data-my-dataset --port 3223 +``` + +### Option 2: Download Prebuilt Databases (Faster) + +```bash +# Configure your label set server +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# Download prebuilt database +./scripts/download-prebuilt-database.sh 3 my-dataset 0 + +# Extract database +tar -xzf databases/3/my-dataset_0.tgz -C data-my-dataset --strip-components=1 + +# Start ENSRainbow server +pnpm run serve --data-dir data-my-dataset --port 3223 +``` + +### 5. 
Version Management + +Implement proper versioning for your label sets: + +```bash +# When releasing a new version +LABEL_SET_ID="my-dataset" +NEW_VERSION="1" + +# Create new .ensrainbow file +pnpm run convert-csv \ + --input-file updated-labels.csv \ + --output-file ${LABEL_SET_ID}_${NEW_VERSION}.ensrainbow \ + --label-set-id ${LABEL_SET_ID} \ + --label-set-version ${NEW_VERSION} + +# Create prebuilt database +pnpm run ingest-ensrainbow \ + --input-file ${LABEL_SET_ID}_${NEW_VERSION}.ensrainbow \ + --data-dir data-${LABEL_SET_ID}-${NEW_VERSION} + +tar -czvf ${LABEL_SET_ID}_${NEW_VERSION}.tgz ./data-${LABEL_SET_ID}-${NEW_VERSION} + +# Calculate checksums +sha256sum ${LABEL_SET_ID}_${NEW_VERSION}.ensrainbow > ${LABEL_SET_ID}_${NEW_VERSION}.ensrainbow.sha256sum +sha256sum ${LABEL_SET_ID}_${NEW_VERSION}.tgz > ${LABEL_SET_ID}_${NEW_VERSION}.tgz.sha256sum + +# Upload to your label set server +# (implementation depends on your hosting platform) +``` + +### 6. Testing Your Label Set Server + +Before publishing, test that your label set server works correctly: + +```bash +# Set your test server URL +export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" + +# Test downloading .ensrainbow file +./scripts/download-ensrainbow-files.sh my-dataset 0 + +# Verify checksum was validated +# The script will fail if checksums don't match + +# Test downloading prebuilt database +./scripts/download-prebuilt-database.sh 3 my-dataset 0 + +# Verify the database works +pnpm run ingest-ensrainbow \ + --input-file labelsets/my-dataset_0.ensrainbow \ + --data-dir test-data + +pnpm run validate --data-dir test-data +``` + +## Running Your Own ENSRainbow Server + +If you want to run your own ENSRainbow API server (separate from the label set server), see the [Local Development](/ensrainbow/contributing/local-development) guide for instructions on setting up and running ENSRainbow locally or in production. 
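+
+Once a server is up, a quick smoke test is to heal a known labelhash over HTTP, assuming the `/v1/heal/{labelhash}` endpoint exposed by the ENSRainbow API (the hash below is the labelhash of `vitalik` used earlier in this guide):
+
+```bash
+curl http://localhost:3223/v1/heal/0xaf2caa1c2ca1d027f1ac823b529d0a67cd144264b2789fa2ea4d63a67c7103cc
+```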
+ +## Related Documentation + +- **[Data Model](/ensrainbow/concepts/data-model)** - Understanding the `.ensrainbow` file format +- **[Label Sets & Versioning](/ensrainbow/concepts/label-sets-and-versioning)** - Managing label set versions +- **[CLI Reference](/ensrainbow/contributing/cli-reference)** - Complete command documentation +- **[Local Development](/ensrainbow/contributing/local-development)** - Setting up your development environment diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx index 8978ca5a9..e1df686d0 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx @@ -104,15 +104,14 @@ subgraph_0.ensrainbow # labelSetId = "subgraph", version = 0 subgraph_1.ensrainbow # next version with incremental labelhash-to-label mappings added ``` -## Converting Legacy SQL Data +## Creating ENSRainbow Files -If you have a legacy gzipped rainbow table (`ens_names.sql.gz`) from the ENS Subgraph, you can convert it to the `.ensrainbow` format: +ENSRainbow provides two methods for creating `.ensrainbow` files from different data sources: -```bash title="Convert legacy SQL data" -pnpm run convert --input-file path/to/ens_names.sql.gz --output-file subgraph-0.ensrainbow -``` +- **SQL Conversion**: Convert legacy ENS Subgraph data (`ens_names.sql.gz`) using `pnpm run convert` +- **CSV Conversion**: Convert custom datasets from CSV files using `pnpm run convert-csv` -This conversion process allows you to migrate existing rainbow table data that was previously stored in SQL format to ENSRainbow's optimized binary format. The resulting `.ensrainbow` file will be equivalent to the rainbow tables used by the ENS Subgraph, maintaining the same label-to-labelhash mappings while providing better performance and storage efficiency. +For complete instructions, examples, and workflow guidance, see the [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) guide. ## Ingestion Process diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx index 64556f1eb..401a0f986 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx @@ -17,6 +17,7 @@ This guide covers running ENSRainbow locally for development and contributions. 
For focused guidance on specific topics, check out these dedicated pages: + @@ -24,6 +25,7 @@ For focused guidance on specific topics, check out these dedicated pages: :::tip[Choose Your Path] - **New to the project?** Start with [Local Development](/ensrainbow/contributing/local-development) +- **Creating custom datasets?** See [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) - **Need CLI help?** Check the [CLI Reference](/ensrainbow/contributing/cli-reference) - **Building for production?** See [Building Docker Images](/ensrainbow/contributing/building) ::: @@ -41,6 +43,7 @@ Follow these steps to start contributing to ENSRainbow: ## Quick Reference - **Need to build from source?** → [Building Docker Images](/ensrainbow/contributing/building) +- **Creating custom datasets?** → [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) - **Looking for CLI commands?** → [CLI Reference](/ensrainbow/contributing/cli-reference) - **Running into issues?** → [Troubleshooting](/ensrainbow/usage/troubleshooting) - **Want to understand the data flow?** → [Data Model](/ensrainbow/concepts/data-model) @@ -265,7 +268,7 @@ These steps are typically performed by project maintainers for releasing officia ### 1. Prepare `.ensrainbow` Files -This section covers the conversion of source data (like SQL dumps or empty files for initial datasets) into the `.ensrainbow` format. The `time` command is used here to measure the duration of potentially long-running conversion processes. +This section covers the conversion of source data (like SQL dumps or empty files for initial datasets) into the `.ensrainbow` format. For detailed conversion instructions and examples, see the [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) guide. **For the `subgraph` Label Set (main dataset):** This command converts a SQL dump file (`ens_names.sql.gz`) into an `.ensrainbow` file for version 0 of the `subgraph` Label Set. From b9c31b08422a1b71100bcec7ac2940a11bb5e35b Mon Sep 17 00:00:00 2001 From: djstrong Date: Fri, 17 Oct 2025 22:45:34 +0200 Subject: [PATCH 07/30] feat: add filtering capabilities to CSV conversion - Introduced `--existing-db-path` option to filter out existing labels from an ENSRainbow database during CSV conversion. - Enhanced conversion process to skip duplicate labels within the same CSV file. - Updated logging to include statistics on filtered labels. - Added comprehensive tests for filtering functionality and updated documentation to reflect new features. 
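
An illustrative invocation of the new option (file and directory names are examples only):

    convert-csv --input-file new-labels.csv --output-file my-dataset_1.ensrainbow \
      --label-set-id my-dataset --label-set-version 1 --existing-db-path data-my-dataset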
--- apps/ensrainbow/src/cli.ts | 6 + .../src/commands/convert-csv-command.test.ts | 189 ++++++++++++++++++ .../src/commands/convert-csv-command.ts | 111 +++++++++- .../ensrainbow/concepts/creating-files.mdx | 86 +++++++- 4 files changed, 379 insertions(+), 13 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 940692729..d9d38c4f9 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -68,6 +68,7 @@ interface ConvertCsvArgs { "label-set-id": LabelSetId; "label-set-version": LabelSetVersion; "progress-interval"?: number; + "existing-db-path"?: string; } export interface CLIOptions { @@ -259,6 +260,10 @@ export function createCLI(options: CLIOptions = {}) { type: "number", description: "Number of records to process before logging progress", default: 10000, + }) + .option("existing-db-path", { + type: "string", + description: "Path to existing ENSRainbow database to filter out existing labels", }); }, async (argv: ArgumentsCamelCase) => { @@ -268,6 +273,7 @@ export function createCLI(options: CLIOptions = {}) { labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], progressInterval: argv["progress-interval"], + existingDbPath: argv["existing-db-path"], }); }, ) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 58c7af900..9e2569ab2 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -216,6 +216,195 @@ describe("convert-csv-command", () => { }); }); + describe("Filtering functionality", () => { + it("should filter out labels that already exist in the database", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "output_filtered.ensrainbow"); + const dataDir = join(tempDir, "db_filtered"); + + // First, create an initial database with some labels + const initialOutputFile = join(tempDir, "initial.ensrainbow"); + await convertCsvCommand({ + inputFile, + outputFile: initialOutputFile, + labelSetId: "test-filtering" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + // Ingest the initial file + const cli = createCLI({ exitProcess: false }); + await cli.parse([ + "ingest-ensrainbow", + "--input-file", + initialOutputFile, + "--data-dir", + dataDir, + ]); + + // Verify initial database + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + const initialCount = await db.getPrecalculatedRainbowRecordCount(); + expect(initialCount).toBe(11); + await db.close(); + + // Now convert the same CSV file again, but with filtering enabled + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-filtering" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, // Use same version as initial + existingDbPath: dataDir, + }); + + // Verify the filtered output file was created + const outputStats = await stat(outputFile); + expect(outputStats.isFile()).toBe(true); + + // The filtered file should be smaller than the original since it excludes existing labels + const initialStats = await stat(initialOutputFile); + expect(outputStats.size).toBeLessThan(initialStats.size); + + // Verify that the filtered file contains fewer records + const filteredDataDir = join(tempDir, "db_filtered_result"); + await cli.parse([ + "ingest-ensrainbow", + "--input-file", + outputFile, + "--data-dir", + filteredDataDir, + ]); + + const filteredDb = await 
ENSRainbowDB.open(filteredDataDir); + expect(await filteredDb.validate()).toBe(true); + const filteredCount = await filteredDb.getPrecalculatedRainbowRecordCount(); + expect(filteredCount).toBe(0); // All labels should be filtered out since they already exist + await filteredDb.close(); + }); + + it("should filter out duplicate labels within the same conversion", async () => { + // Create a CSV file with duplicate labels + const csvContent = "label1\nlabel2\nlabel1\nlabel3\nlabel2\nlabel4"; + const inputFile = join(tempDir, "duplicates.csv"); + await writeFile(inputFile, csvContent); + + const outputFile = join(tempDir, "output_no_duplicates.ensrainbow"); + + // Convert CSV with duplicate filtering + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-duplicates" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + }); + + // Verify the output file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Ingest and verify only unique labels were processed + const dataDir = join(tempDir, "db_no_duplicates"); + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + + // Should have 4 unique labels (label1, label2, label3, label4) + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + expect(recordsCount).toBe(4); + + // Verify specific labels exist + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("label1"))))?.label, + ).toBe("label1"); + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("label2"))))?.label, + ).toBe("label2"); + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("label3"))))?.label, + ).toBe("label3"); + expect( + (await db.getVersionedRainbowRecord(labelHashToBytes(labelhash("label4"))))?.label, + ).toBe("label4"); + + await db.close(); + }); + + it("should handle non-existent database path gracefully", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "output_no_db.ensrainbow"); + const nonExistentDbPath = join(tempDir, "non-existent-db"); + + // Should not throw error even with non-existent database path + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-no-db" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + existingDbPath: nonExistentDbPath, + }), + ).resolves.not.toThrow(); + + // Verify the output file was still created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + }); + + it("should work through CLI with existing database path", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "cli_output_with_db.ensrainbow"); + const dataDir = join(tempDir, "cli_db_with_filtering"); + + // First create a database + const initialOutputFile = join(tempDir, "initial_cli.ensrainbow"); + const cli = createCLI({ exitProcess: false }); + + await cli.parse([ + "convert-csv", + "--input-file", + inputFile, + "--output-file", + initialOutputFile, + "--label-set-id", + "test-cli-filtering", + "--label-set-version", + "0", + ]); + + await cli.parse([ + "ingest-ensrainbow", + "--input-file", + initialOutputFile, + "--data-dir", + dataDir, + ]); + + // Now test CLI with existing 
database path + await cli.parse([ + "convert-csv", + "--input-file", + inputFile, + "--output-file", + outputFile, + "--label-set-id", + "test-cli-filtering", + "--label-set-version", + "1", + "--existing-db-path", + dataDir, + ]); + + // Verify file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + }); + }); + describe("Streaming performance", () => { it("should handle small CSV files efficiently", async () => { const inputFile = join(tempDir, "small_test.csv"); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 14ae2d4b3..34f64d935 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -9,6 +9,7 @@ import { createReadStream, createWriteStream } from "fs"; import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk"; import { parse } from "@fast-csv/parse"; import { labelhash } from "viem"; +import { ENSRainbowDB } from "../lib/database.js"; import { logger } from "../utils/logger.js"; import { CURRENT_ENSRAINBOW_FILE_FORMAT_VERSION, @@ -21,6 +22,7 @@ export interface ConvertCsvCommandOptions { labelSetId: string; labelSetVersion: number; progressInterval?: number; + existingDbPath?: string; // Path to existing ENSRainbow database to check for existing labels } // Configuration constants @@ -29,6 +31,8 @@ const DEFAULT_PROGRESS_INTERVAL = 10000; interface ConversionStats { totalLines: number; processedRecords: number; + filteredExistingLabels: number; + filteredDuplicates: number; startTime: Date; endTime?: Date; } @@ -74,19 +78,47 @@ function logSummary(stats: ConversionStats) { logger.info("=== Conversion Summary ==="); logger.info(`Total lines processed: ${stats.totalLines}`); logger.info(`Valid records: ${stats.processedRecords}`); + logger.info(`Filtered existing labels: ${stats.filteredExistingLabels}`); + logger.info(`Filtered duplicates: ${stats.filteredDuplicates}`); logger.info(`Duration: ${duration}ms`); } +/** + * Check if a labelhash exists in the ENSRainbow database + */ +async function checkLabelHashExists(db: ENSRainbowDB, labelHashBytes: Buffer): Promise { + try { + const record = await db.getVersionedRainbowRecord(labelHashBytes); + return record !== null; + } catch (error) { + // If there's an error checking, assume it doesn't exist + return false; + } +} + /** * Initialize conversion setup and logging */ -function initializeConversion(options: ConvertCsvCommandOptions) { +async function initializeConversion(options: ConvertCsvCommandOptions) { logger.info("Starting conversion from CSV to protobuf format..."); logger.info(`Input file: ${options.inputFile}`); logger.info(`Output file: ${options.outputFile}`); logger.info(`Label set id: ${options.labelSetId}`); logger.info(`Label set version: ${options.labelSetVersion}`); + // Open existing database if path is provided + let existingDb: ENSRainbowDB | null = null; + if (options.existingDbPath) { + try { + logger.info(`Opening existing database for filtering: ${options.existingDbPath}`); + existingDb = await ENSRainbowDB.open(options.existingDbPath); + logger.info("Successfully opened existing database for label filtering"); + } catch (error) { + logger.warn(`Failed to open existing database at ${options.existingDbPath}: ${error}`); + logger.warn("Proceeding without filtering existing labels"); + } + } + const { RainbowRecordType, RainbowRecordCollectionType } = 
createRainbowProtobufRoot(); const outputStream = setupWriteStream(options.outputFile); @@ -99,7 +131,7 @@ function initializeConversion(options: ConvertCsvCommandOptions) { logger.info("Reading and processing CSV file line by line with streaming..."); - return { RainbowRecordType, outputStream }; + return { RainbowRecordType, outputStream, existingDb }; } /** @@ -131,13 +163,16 @@ function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string /** * Process a single CSV record */ -function processRecord( +async function processRecord( row: string[], expectedColumns: number, RainbowRecordType: any, outputStream: NodeJS.WritableStream, lineNumber: number, -): void { + existingDb: ENSRainbowDB | null, + writtenLabels: Set, + stats: ConversionStats, +): Promise { // Validate column count if (row.length !== expectedColumns) { throw new Error( @@ -146,10 +181,32 @@ function processRecord( } const rainbowRecord = createRainbowRecord(row); + const label = rainbowRecord.label; + const labelHashBytes = rainbowRecord.labelhash; + + // Check if labelhash already exists in the database + if (existingDb) { + const existsInDb = await checkLabelHashExists(existingDb, labelHashBytes); + if (existsInDb) { + stats.filteredExistingLabels++; + return false; // Skip this record + } + } + + // Check if label is a duplicate within this conversion + if (writtenLabels.has(label)) { + stats.filteredDuplicates++; + return false; // Skip this record + } + + // Add label to written set to track duplicates + writtenLabels.add(label); // Create protobuf message and write immediately const recordMessage = RainbowRecordType.fromObject(rainbowRecord); outputStream.write(Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish())); + + return true; // Record was processed } /** @@ -160,16 +217,19 @@ async function processCSVFile( RainbowRecordType: any, outputStream: NodeJS.WritableStream, progressInterval: number, + existingDb: ENSRainbowDB | null, + stats: ConversionStats, ): Promise<{ totalLines: number; processedRecords: number }> { return new Promise((resolve, reject) => { let expectedColumns: number | null = null; let lineNumber = 0; let processedRecords = 0; + const writtenLabels = new Set(); // Track labels written in this conversion const fileStream = createReadStream(inputFile, { encoding: "utf8" }); const csvStream = parse() - .on("data", (row: string[]) => { + .on("data", async (row: string[]) => { lineNumber++; try { @@ -179,12 +239,26 @@ async function processCSVFile( logger.info(`Detected ${expectedColumns} columns using fast-csv`); } - processRecord(row, expectedColumns, RainbowRecordType, outputStream, lineNumber); - processedRecords++; + const wasProcessed = await processRecord( + row, + expectedColumns, + RainbowRecordType, + outputStream, + lineNumber, + existingDb, + writtenLabels, + stats, + ); + + if (wasProcessed) { + processedRecords++; + } // Log progress for large files - if (processedRecords % progressInterval === 0) { - logger.info(`Processed ${processedRecords} records so far...`); + if (lineNumber % progressInterval === 0) { + logger.info( + `Processed ${lineNumber} lines, written ${processedRecords} records so far...`, + ); } } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); @@ -219,11 +293,16 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom const stats: ConversionStats = { totalLines: 0, processedRecords: 0, + filteredExistingLabels: 0, + filteredDuplicates: 0, startTime: new Date(), }; + let existingDb: ENSRainbowDB | null = null; + try { - const { RainbowRecordType, outputStream } = initializeConversion(options); + const { RainbowRecordType, outputStream, existingDb: db } = await initializeConversion(options); + existingDb = db; const progressInterval = options.progressInterval ?? DEFAULT_PROGRESS_INTERVAL; @@ -233,6 +312,8 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom RainbowRecordType, outputStream, progressInterval, + existingDb, + stats, ); stats.totalLines = totalLines; @@ -248,5 +329,15 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom const errorMessage = error instanceof Error ? error.message : String(error); logger.error("❌ CSV conversion failed:", errorMessage); throw error; + } finally { + // Clean up database connection + if (existingDb) { + try { + await existingDb.close(); + logger.info("Closed existing database connection"); + } catch (error) { + logger.warn(`Failed to close existing database: ${error}`); + } + } } } diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index f2c9c34cf..125e9916a 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -123,7 +123,8 @@ pnpm run convert-csv \ --output-file \ --label-set-id \ --label-set-version \ - [--progress-interval ] + [--progress-interval ] \ + [--existing-db-path ] ``` ### Required Parameters @@ -136,6 +137,7 @@ pnpm run convert-csv \ - `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) - `--progress-interval`: Progress logging frequency (default: 10000 records) +- `--existing-db-path`: Path to existing ENSRainbow database to filter out existing labels ### CSV Format Support @@ -159,6 +161,42 @@ ens,0x5cee339e13375638553bdf5a6e36ba80fb9f6a4f0783680884d92b558aa471da The converter validates that provided labelhashes match the computed hash for each label. +### Label Filtering + +The CSV converter includes built-in filtering capabilities to prevent duplicate labels: + +#### Filtering Existing Labels +Use `--existing-db-path` to filter out labels that already exist in an existing ENSRainbow database: + +```bash +pnpm run convert-csv \ + --input-file new-labels.csv \ + --output-file incremental_1.ensrainbow \ + --label-set-id my-dataset \ + --label-set-version 1 \ + --existing-db-path data-my-dataset +``` + +This will: +- Check each label against the existing database +- Skip labels that already exist (avoiding duplicates) +- Only write new labels to the output file +- Log filtering statistics in the conversion summary + +#### Filtering Duplicate Labels Within CSV +The converter automatically filters duplicate labels within the same CSV file, keeping only the first occurrence of each label. + +#### Filtering Statistics +The conversion process logs detailed statistics: +``` +=== Conversion Summary === +Total lines processed: 1000 +Valid records: 850 +Filtered existing labels: 100 +Filtered duplicates: 50 +Duration: 150ms +``` + ### Example: Creating Test Dataset ```bash @@ -188,7 +226,9 @@ pnpm run convert-csv \ 2. 
**Streams** CSV parsing using fast-csv for memory efficiency 3. **Validates** column count and data format 4. **Computes** or validates labelhashes as needed -5. **Writes** protobuf messages with the same format as SQL conversion +5. **Filters** existing labels if `--existing-db-path` is provided +6. **Filters** duplicate labels within the same CSV file +7. **Writes** protobuf messages with the same format as SQL conversion ## Common Workflows @@ -256,7 +296,39 @@ pnpm run ingest-ensrainbow \ pnpm run serve --data-dir data-custom --port 3223 ``` -### Workflow 4: Using Custom Label Set Server +### Workflow 4: Creating Incremental Updates + +```bash +# 1. Create initial dataset +pnpm run convert-csv \ + --input-file initial-labels.csv \ + --output-file my-dataset_0.ensrainbow \ + --label-set-id my-dataset \ + --label-set-version 0 + +# 2. Ingest initial data +pnpm run ingest-ensrainbow \ + --input-file my-dataset_0.ensrainbow \ + --data-dir data-my-dataset + +# 3. Create incremental update (filtering existing labels) +pnpm run convert-csv \ + --input-file new-labels.csv \ + --output-file my-dataset_1.ensrainbow \ + --label-set-id my-dataset \ + --label-set-version 1 \ + --existing-db-path data-my-dataset + +# 4. Ingest incremental update +pnpm run ingest-ensrainbow \ + --input-file my-dataset_1.ensrainbow \ + --data-dir data-my-dataset + +# 5. Serve updated data +pnpm run serve --data-dir data-my-dataset --port 3223 +``` + +### Workflow 5: Using Custom Label Set Server ```bash # 1. Configure custom label set server @@ -318,6 +390,14 @@ pnpm run convert-csv \ --label-set-id my-dataset \ --label-set-version 0 +# For CSV data with filtering (if you have an existing database) +pnpm run convert-csv \ + --input-file my-labels.csv \ + --output-file my-dataset_1.ensrainbow \ + --label-set-id my-dataset \ + --label-set-version 1 \ + --existing-db-path data-my-dataset + # For SQL data pnpm run convert \ --input-file my-data.sql.gz \ From e2b9255224621dac9208bb6c6f2ca00b6fbaf75c Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 24 Nov 2025 13:26:01 +0100 Subject: [PATCH 08/30] feat: enhance CSV conversion with Bloom filter and deduplication options - Added new command-line options for CSV conversion: `--silent`, `--disable-dedup`, `--cache-size`, `--use-bloom-filter`, and `--bloom-filter-size`. - Implemented a deduplication database using ClassicLevel with optional Bloom filter for faster processing. - Updated the conversion process to support deduplication and improved memory management. - Enhanced logging for large file processing and added tests for new deduplication features. 
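
An illustrative invocation combining the new options (all values are examples only):

    convert-csv --input-file big.csv --output-file big_0.ensrainbow \
      --label-set-id big --label-set-version 0 \
      --use-bloom-filter --bloom-filter-size 50000000 --cache-size 10000 --silent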
--- apps/ensrainbow/package.json | 4 +- apps/ensrainbow/src/cli.ts | 47 ++- .../src/commands/convert-csv-command.test.ts | 39 +- .../src/commands/convert-csv-command.ts | 342 +++++++++++++++--- pnpm-lock.yaml | 51 +++ 5 files changed, 427 insertions(+), 56 deletions(-) diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index 046cb2e2e..341e0d440 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -19,7 +19,8 @@ "validate:lite": "tsx src/cli.ts validate --lite", "purge": "tsx src/cli.ts purge", "convert": "tsx src/cli.ts convert", - "test": "vitest", + "convert-csv": "NODE_OPTIONS='--expose-gc --max-old-space-size=4096' tsx src/cli.ts convert-csv", + "test": "NODE_OPTIONS='--max-old-space-size=8192' vitest", "test:coverage": "vitest --coverage", "lint": "biome check --write .", "lint:ci": "biome ci", @@ -32,6 +33,7 @@ "@ensnode/ensrainbow-sdk": "workspace:*", "@ensnode/ensnode-sdk": "workspace:*", "@hono/node-server": "^1.4.1", + "bloom-filters": "^3.0.4", "classic-level": "^1.4.1", "hono": "catalog:", "pino": "catalog:", diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index d9d38c4f9..6e6bb4f32 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -69,6 +69,11 @@ interface ConvertCsvArgs { "label-set-version": LabelSetVersion; "progress-interval"?: number; "existing-db-path"?: string; + "silent"?: boolean; + "disable-dedup"?: boolean; + "cache-size"?: number; + "use-bloom-filter"?: boolean; + "bloom-filter-size"?: number; } export interface CLIOptions { @@ -261,10 +266,35 @@ export function createCLI(options: CLIOptions = {}) { description: "Number of records to process before logging progress", default: 10000, }) - .option("existing-db-path", { - type: "string", - description: "Path to existing ENSRainbow database to filter out existing labels", - }); + .option("existing-db-path", { + type: "string", + description: "Path to existing ENSRainbow database to filter out existing labels", + }) + .option("silent", { + type: "boolean", + description: "Disable progress bar (useful for scripts)", + default: false, + }) + .option("disable-dedup", { + type: "boolean", + description: "Disable deduplication within CSV file (faster but may create duplicates)", + default: false, + }) + .option("cache-size", { + type: "number", + description: "Cache size for deduplication (default: 5000)", + default: 5000, + }) + .option("use-bloom-filter", { + type: "boolean", + description: "Use Bloom filter for faster deduplication (default: false)", + default: false, + }) + .option("bloom-filter-size", { + type: "number", + description: "Expected number of items for Bloom filter (default: 10000000)", + default: 10000000, + }); }, async (argv: ArgumentsCamelCase) => { await convertCsvCommand({ @@ -272,8 +302,13 @@ export function createCLI(options: CLIOptions = {}) { outputFile: argv["output-file"], labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], - progressInterval: argv["progress-interval"], - existingDbPath: argv["existing-db-path"], + progressInterval: argv["progress-interval"], + existingDbPath: argv["existing-db-path"], + silent: argv["silent"], + noDedup: argv["disable-dedup"], + cacheSize: argv["cache-size"], + useBloomFilter: argv["use-bloom-filter"], + bloomFilterSize: argv["bloom-filter-size"], }); }, ) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 9e2569ab2..c6ddadb03 100644 --- 
a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -38,6 +38,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-csv-one-col" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); // Verify the output file was created @@ -71,6 +72,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-csv-two-col" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); // Verify the output file was created @@ -119,6 +121,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-csv-special" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); // Verify output file was created @@ -229,6 +232,7 @@ describe("convert-csv-command", () => { outputFile: initialOutputFile, labelSetId: "test-filtering" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); // Ingest the initial file @@ -255,6 +259,7 @@ describe("convert-csv-command", () => { labelSetId: "test-filtering" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, // Use same version as initial existingDbPath: dataDir, + silent: true, }); // Verify the filtered output file was created @@ -296,6 +301,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-duplicates" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); // Verify the output file was created @@ -400,10 +406,10 @@ describe("convert-csv-command", () => { // Verify file was created const stats = await stat(outputFile); - expect(stats.isFile()).toBe(true); - expect(stats.size).toBeGreaterThan(0); - }); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); }); +}); describe("Streaming performance", () => { it("should handle small CSV files efficiently", async () => { @@ -426,6 +432,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-small" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, + silent: true, }); const conversionTime = Date.now() - startTime; @@ -453,5 +460,31 @@ describe("convert-csv-command", () => { const dbStats = await stat(dataDir); expect(dbStats.isDirectory()).toBe(true); }); + + it("should handle CSV files with many unique labels", async () => { + const inputFile = join(tempDir, "many_labels.csv"); + const outputFile = join(tempDir, "output_many_labels.ensrainbow"); + + // Create a CSV with 50,000 unique labels (tests deduplication with increased memory limit) + const records = []; + for (let i = 0; i < 50_000; i++) { + records.push(`label${i}`); + } + await writeFile(inputFile, records.join("\n")); + + // This should work without memory issues + await convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-many-labels" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + silent: true, + }); + + // Verify file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + }, 60000); // 60 second timeout for large file test }); }); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 34f64d935..0e0c8ac0e 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -5,10 +5,15 @@ * Supports 1-column (label only) and 2-column (label,labelhash) formats */ -import { createReadStream, createWriteStream } from "fs"; +import { createReadStream, createWriteStream, 
statSync } from "fs";
+import { rmSync } from "fs";
+import { join } from "path";
 import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk";
 import { parse } from "@fast-csv/parse";
 import { labelhash } from "viem";
+import { ClassicLevel } from "classic-level";
+import ProgressBar from "progress";
+import bloomFilters from "bloom-filters";
 import { ENSRainbowDB } from "../lib/database.js";
 import { logger } from "../utils/logger.js";
 import {
@@ -16,6 +21,129 @@ import {
   createRainbowProtobufRoot,
 } from "../utils/protobuf-schema.js";
 
+/**
+ * Simple deduplication database using ClassicLevel directly
+ */
+class DeduplicationDB {
+  private pendingWrites: Map<string, string> = new Map();
+  private cache: Map<string, boolean> = new Map();
+  private cacheSize: number;
+  private bloomFilter: InstanceType<typeof bloomFilters.BloomFilter> | null = null;
+
+  constructor(private db: ClassicLevel<string, string>, cacheSize: number = 10000, useBloomFilter: boolean = false, expectedItems: number = 10000000) {
+    this.cacheSize = cacheSize;
+
+    if (useBloomFilter) {
+      // Create Bloom filter with a 1% false positive rate
+      this.bloomFilter = bloomFilters.BloomFilter.create(expectedItems, 0.01);
+      logger.info(`Created Bloom filter for ${expectedItems} items (~${(this.bloomFilter.size / 8 / 1024 / 1024).toFixed(2)} MB)`);
+    }
+  }
+
+  async has(key: string): Promise<boolean> {
+    // Check cache first
+    if (this.cache.has(key)) {
+      return this.cache.get(key)!;
+    }
+
+    // Check pending writes
+    if (this.pendingWrites.has(key)) {
+      this.cache.set(key, true);
+      return true;
+    }
+
+    // Use Bloom filter if available
+    if (this.bloomFilter) {
+      // If Bloom filter says "not present", we can skip LevelDB check
+      if (!this.bloomFilter.has(key)) {
+        this.cache.set(key, false);
+        return false;
+      }
+      // Bloom filter says "maybe present" - need to check LevelDB
+    }
+
+    // Check database
+    try {
+      await this.db.get(key);
+      this.cache.set(key, true);
+      return true;
+    } catch (error) {
+      this.cache.set(key, false);
+      return false;
+    }
+  }
+
+  async add(key: string, value: string): Promise<void> {
+    this.pendingWrites.set(key, value);
+    this.cache.set(key, true); // Cache the fact that this key exists
+
+    // Add to Bloom filter if available
+    if (this.bloomFilter) {
+      this.bloomFilter.add(key);
+    }
+
+    // Check cache size periodically (not on every add)
+    this.evictCacheIfNeeded();
+
+    // Flush to database periodically (smaller batch to reduce memory usage)
+    if (this.pendingWrites.size >= 5000) {
+      await this.flush();
+    }
+  }
+
+  private evictCacheIfNeeded(): void {
+    // Limit cache size - only evict when significantly exceeded
+    if (this.cache.size > this.cacheSize * 1.2) {
+      // Remove oldest 20% of entries
+      let toRemove = Math.floor(this.cacheSize * 0.2);
+      for (const key of this.cache.keys()) {
+        if (toRemove-- <= 0) break;
+        this.cache.delete(key);
+      }
+    }
+  }
+
+  async flush(): Promise<void> {
+    if (this.pendingWrites.size === 0) return;
+
+    const batch = this.db.batch();
+    for (const [key, value] of this.pendingWrites) {
+      batch.put(key, value);
+    }
+    await batch.write();
+    this.pendingWrites.clear();
+
+    // Hint to garbage collector after large batch
+    if (global.gc) {
+      global.gc();
+    }
+  }
+
+  async close(): Promise<void> {
+    await this.flush();
+    await this.db.close();
+  }
+}
+
+
+/**
+ * Sets up a simple progress bar that shows speed without total count.
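+ * The underlying "progress" package requires a fixed total, so the bar below is
+ * given a deliberately huge total and is read for its line count and rate rather
+ * than for percentage complete.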
+ */ +function setupProgressBar(): ProgressBar { + return new ProgressBar( + "Processing CSV [:bar] :current lines - :rate lines/sec", + { + complete: "=", + incomplete: " ", + width: 40, + total: 200000000, // Very large total for big files + }, + ); +} + +/** + * Options for CSV conversion command + */ export interface ConvertCsvCommandOptions { inputFile: string; outputFile: string; @@ -23,6 +151,11 @@ export interface ConvertCsvCommandOptions { labelSetVersion: number; progressInterval?: number; existingDbPath?: string; // Path to existing ENSRainbow database to check for existing labels + silent?: boolean; // Disable progress bar for tests + noDedup?: boolean; // Disable deduplication within CSV file + cacheSize?: number; // Cache size for deduplication (default: 10000) + useBloomFilter?: boolean; // Use Bloom filter for faster deduplication (default: false) + bloomFilterSize?: number; // Expected number of items for Bloom filter (default: 10000000) } // Configuration constants @@ -106,6 +239,20 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { logger.info(`Label set id: ${options.labelSetId}`); logger.info(`Label set version: ${options.labelSetVersion}`); + // Check file size and warn for very large files + try { + const stats = statSync(options.inputFile); + const fileSizeMB = (stats.size / (1024 * 1024)).toFixed(2); + logger.info(`Input file size: ${fileSizeMB} MB`); + + if (stats.size > 1024 * 1024 * 1024) { // > 1GB + logger.warn("⚠️ Processing a very large file. This may take significant time and memory."); + logger.warn("💡 Consider using --existing-db-path to filter out existing labels for better performance."); + } + } catch (error) { + logger.warn(`Could not determine file size: ${error}`); + } + // Open existing database if path is provided let existingDb: ENSRainbowDB | null = null; if (options.existingDbPath) { @@ -143,7 +290,6 @@ function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string if (row.length === 1) { // Single column: compute labelhash using labelhash function const labelHashBytes = labelHashToBytes(labelhash(label)); - console.log(label); return { labelhash: Buffer.from(labelHashBytes), label: label, @@ -161,7 +307,7 @@ function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string } /** - * Process a single CSV record + * Process a single CSV record with LevelDB-based deduplication */ async function processRecord( row: string[], @@ -170,7 +316,7 @@ async function processRecord( outputStream: NodeJS.WritableStream, lineNumber: number, existingDb: ENSRainbowDB | null, - writtenLabels: Set, + dedupDb: DeduplicationDB | null, stats: ConversionStats, ): Promise { // Validate column count @@ -184,7 +330,7 @@ async function processRecord( const label = rainbowRecord.label; const labelHashBytes = rainbowRecord.labelhash; - // Check if labelhash already exists in the database + // Check if labelhash already exists in the existing database if (existingDb) { const existsInDb = await checkLabelHashExists(existingDb, labelHashBytes); if (existsInDb) { @@ -193,14 +339,17 @@ async function processRecord( } } - // Check if label is a duplicate within this conversion - if (writtenLabels.has(label)) { - stats.filteredDuplicates++; - return false; // Skip this record - } + // Check if label is a duplicate within this conversion using LevelDB (if enabled) + if (dedupDb) { + const existsInDedupDb = await dedupDb.has(label); + if (existsInDedupDb) { + stats.filteredDuplicates++; + return false; // Skip this record + } 
- // Add label to written set to track duplicates - writtenLabels.add(label); + // Add label to deduplication database + await dedupDb.add(label, ""); + } // Create protobuf message and write immediately const recordMessage = RainbowRecordType.fromObject(rainbowRecord); @@ -218,49 +367,89 @@ async function processCSVFile( outputStream: NodeJS.WritableStream, progressInterval: number, existingDb: ENSRainbowDB | null, + dedupDb: DeduplicationDB | null, stats: ConversionStats, + progressBar: ProgressBar | null, ): Promise<{ totalLines: number; processedRecords: number }> { - return new Promise((resolve, reject) => { - let expectedColumns: number | null = null; - let lineNumber = 0; - let processedRecords = 0; - const writtenLabels = new Set(); // Track labels written in this conversion + let expectedColumns: number | null = null; + let lineNumber = 0; + let processedRecords = 0; + let lastLoggedLine = 0; // Track last logged line to avoid duplicate logs + const startTime = Date.now(); // Track start time for overall processing + let lastLogTime = Date.now(); // Track time of last log for chunk timing + + // LevelDB-based deduplication: Uses temporary database to avoid RAM limits - const fileStream = createReadStream(inputFile, { encoding: "utf8" }); + const fileStream = createReadStream(inputFile, { encoding: "utf8" }); + + return new Promise((resolve, reject) => { + let pendingCount = 0; + const MAX_PENDING = 100; // Smaller limit to reduce memory const csvStream = parse() - .on("data", async (row: string[]) => { + .on("data", (row: string[]) => { lineNumber++; - try { - // For the first row, detect column count - if (expectedColumns === null) { - expectedColumns = row.length; - logger.info(`Detected ${expectedColumns} columns using fast-csv`); - } + // For the first row, detect column count + if (expectedColumns === null) { + expectedColumns = row.length; + logger.info(`Detected ${expectedColumns} columns using fast-csv`); + } - const wasProcessed = await processRecord( - row, - expectedColumns, - RainbowRecordType, - outputStream, - lineNumber, - existingDb, - writtenLabels, - stats, + // Log progress synchronously when line is read (not in async callback) + // This ensures logs appear at the correct intervals + if (lineNumber % progressInterval === 0 && lineNumber !== lastLoggedLine) { + const currentTime = Date.now(); + const chunkTime = currentTime - lastLogTime; // Time for this 10k chunk + const totalElapsed = currentTime - startTime; // Total time since start + const chunkTimeSeconds = (chunkTime / 1000).toFixed(2); + const totalTimeSeconds = (totalElapsed / 1000).toFixed(2); + const linesPerSecond = ((progressInterval / chunkTime) * 1000).toFixed(0); + + lastLoggedLine = lineNumber; + lastLogTime = currentTime; + + // Note: processedRecords may be slightly behind due to async processing + logger.info( + `Processed ${lineNumber} lines, written ${processedRecords} records | ` + + `Chunk: ${chunkTimeSeconds}s (${linesPerSecond} lines/sec) | ` + + `Total: ${totalTimeSeconds}s` ); + } + + // Backpressure: pause if too many pending + if (pendingCount >= MAX_PENDING) { + csvStream.pause(); + } + pendingCount++; + processRecord( + row, + expectedColumns, + RainbowRecordType, + outputStream, + lineNumber, + existingDb, + dedupDb, + stats, + ).then((wasProcessed) => { if (wasProcessed) { processedRecords++; } - - // Log progress for large files - if (lineNumber % progressInterval === 0) { - logger.info( - `Processed ${lineNumber} lines, written ${processedRecords} records so far...`, - ); + + 
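// A short illustrative sketch (annotation only; names are assumed, not from the patch)
// of the pause/resume backpressure rule used around this callback: reading pauses once
// MAX_PENDING rows are in flight and resumes after the backlog half-drains:
function applyBackpressure(
  stream: { pause(): void; resume(): void; isPaused(): boolean },
  pendingCount: number,
  maxPending: number,
): void {
  if (pendingCount >= maxPending) {
    stream.pause(); // too many unfinished rows: stop emitting "data" events
  } else if (stream.isPaused() && pendingCount < maxPending / 2) {
    stream.resume(); // backlog half-drained: start reading again
  }
}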
// Update progress bar every 1000 lines + if (lineNumber % 1000 === 0 && progressBar) { + progressBar.tick(1000); + progressBar.curr = lineNumber; + } + + pendingCount--; + + // Resume when under threshold + if (csvStream.isPaused() && pendingCount < MAX_PENDING / 2) { + csvStream.resume(); } - } catch (error) { + }).catch((error) => { const errorMessage = error instanceof Error ? error.message : String(error); csvStream.destroy(); fileStream.destroy(); @@ -269,12 +458,18 @@ async function processCSVFile( `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, ), ); - } + }); }) .on("error", (error: Error) => { reject(new Error(`CSV parsing error: ${error.message}`)); }) - .on("end", () => { + .on("end", async () => { + // Wait for all pending to complete + while (pendingCount > 0) { + await new Promise(resolve => setTimeout(resolve, 10)); + } + const dedupStatus = dedupDb ? "LevelDB deduplication completed" : "Deduplication disabled"; + logger.info(dedupStatus); resolve({ totalLines: lineNumber, processedRecords }); }); @@ -299,13 +494,38 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom }; let existingDb: ENSRainbowDB | null = null; + let dedupDb: DeduplicationDB | null = null; + let tempDedupDir: string | null = null; try { const { RainbowRecordType, outputStream, existingDb: db } = await initializeConversion(options); existingDb = db; + // Create temporary deduplication database (if not disabled) + if (!options.noDedup) { + tempDedupDir = join(process.cwd(), 'temp-dedup-' + Date.now()); + logger.info(`Creating temporary deduplication database at: ${tempDedupDir}`); + const tempDb = new ClassicLevel(tempDedupDir, { + keyEncoding: 'utf8', + valueEncoding: 'utf8', + createIfMissing: true, + }); + await tempDb.open(); + dedupDb = new DeduplicationDB( + tempDb, + options.cacheSize ?? 10000, + options.useBloomFilter ?? false, + options.bloomFilterSize ?? 10000000 + ); + } else { + logger.info("Deduplication disabled - processing all records"); + } + const progressInterval = options.progressInterval ?? DEFAULT_PROGRESS_INTERVAL; + // Set up progress bar (only if not silent) + const progressBar = options.silent ? null : setupProgressBar(); + // Process the CSV file const { totalLines, processedRecords } = await processCSVFile( options.inputFile, @@ -313,11 +533,21 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom outputStream, progressInterval, existingDb, + dedupDb, stats, + progressBar, ); - stats.totalLines = totalLines; - stats.processedRecords = processedRecords; + stats.totalLines = totalLines; + stats.processedRecords = processedRecords; + + // Log final progress for large files + if (totalLines > 10_000) { + const dedupStatus = options.noDedup ? 
"dedup disabled" : "LevelDB dedup active"; + logger.info( + `✅ Completed processing ${totalLines.toLocaleString()} lines, wrote ${processedRecords.toLocaleString()} records (${dedupStatus})`, + ); + } // Close output stream outputStream.end(); @@ -330,7 +560,17 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom logger.error("❌ CSV conversion failed:", errorMessage); throw error; } finally { - // Clean up database connection + // Clean up deduplication database + if (dedupDb) { + try { + await dedupDb.close(); + logger.info("Closed deduplication database"); + } catch (error) { + logger.warn(`Failed to close deduplication database: ${error}`); + } + } + + // Clean up existing database connection if (existingDb) { try { await existingDb.close(); @@ -339,5 +579,15 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom logger.warn(`Failed to close existing database: ${error}`); } } + + // Remove temporary deduplication database directory + if (tempDedupDir) { + try { + rmSync(tempDedupDir, { recursive: true, force: true }); + logger.info(`Removed temporary deduplication database: ${tempDedupDir}`); + } catch (error) { + logger.warn(`Failed to remove temporary deduplication database: ${error}`); + } + } } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3dea391e0..3ccf052d8 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -462,6 +462,9 @@ importers: '@hono/node-server': specifier: ^1.4.1 version: 1.19.5(hono@4.10.3) + bloom-filters: + specifier: ^3.0.4 + version: 3.0.4 classic-level: specifier: ^1.4.1 version: 1.4.1 @@ -3435,6 +3438,9 @@ packages: '@types/sax@1.2.7': resolution: {integrity: sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==} + '@types/seedrandom@3.0.8': + resolution: {integrity: sha512-TY1eezMU2zH2ozQoAFAQFOPpvP15g+ZgSfTZt31AUUH/Rxtnz3H+A/Sv1Snw2/amp//omibc+AEkTaA8KUeOLQ==} + '@types/tar@6.1.13': resolution: {integrity: sha512-IznnlmU5f4WcGTh2ltRu/Ijpmk8wiWXfF0VA4s+HPjHZgvFggk1YaIkbo5krX/zUCzWF8N/l4+W/LNxnvAJ8nw==} @@ -3783,6 +3789,10 @@ packages: base-64@1.0.0: resolution: {integrity: sha512-kwDPIFCGx0NZHog36dj+tHiwP4QMzsZ3AgMViUBKI0+V5n4U0ufTCUMhnQ04diaRI8EX/QcPfql7zlhZ7j4zgg==} + base64-arraybuffer@1.0.2: + resolution: {integrity: sha512-I3yl4r9QB5ZRY3XuJVEPfc2XhZO6YweFPI+UovAzn+8/hb3oJ6lnysaFcjVpkCPfVWFUDvoZ8kmVDP7WyRtYtQ==} + engines: {node: '>= 0.6.0'} + base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} @@ -3813,6 +3823,10 @@ packages: bintrees@1.0.2: resolution: {integrity: sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==} + bloom-filters@3.0.4: + resolution: {integrity: sha512-BdnPWo2OpYhlvuP2fRzJBdioMCkm7Zp0HCf8NJgF5Mbyqy7VQ/CnTiVWMMyq4EZCBHwj0Kq6098gW2/3RsZsrA==} + engines: {node: '>=12'} + boolbase@1.0.0: resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} @@ -4167,6 +4181,9 @@ packages: csstype@3.2.3: resolution: {integrity: sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==} + cuint@0.2.2: + resolution: {integrity: sha512-d4ZVpCW31eWwCMe1YT3ur7mUDnTXbgwyzaL320DrcRT45rfjYxkt5QWLrmOJ+/UEAI2+fQgKe/fCjR8l4TpRgw==} + cytoscape-cose-bilkent@4.1.0: resolution: {integrity: sha512-wgQlVIUJF13Quxiv5e1gstZ08rnZj2XaLHGoFMYXz7SkNfCDOOteKBE6SYRfA9WxxI/iBc3ajfDoc6hb/MRAHQ==} peerDependencies: @@ -6537,6 +6554,9 @@ packages: 
recma-stringify@1.0.0: resolution: {integrity: sha512-cjwII1MdIIVloKvC9ErQ+OgAtwHBmcZ0Bg4ciz78FtbT8In39aAYbaA7zvxQ61xVMSPE8WxhLwLbhif4Js2C+g==} + reflect-metadata@0.1.14: + resolution: {integrity: sha512-ZhYeb6nRaXCfhnndflDK8qI6ZQ/YcWZCISRAWICW9XYqMUwjZM9Z0DveWX/ABN01oxSHwVxKQmxeYZSsm0jh5A==} + regex-recursion@6.0.2: resolution: {integrity: sha512-0YCaSCq2VRIebiaUviZNs0cBz1kg5kVS2UKUfNIx8YVs1cN3AV7NTctO5FOKBA+UT2BPJIWZauYHPqJODG50cg==} @@ -6708,6 +6728,9 @@ packages: secure-json-parse@4.1.0: resolution: {integrity: sha512-l4KnYfEyqYJxDwlNVyRfO2E4NTHfMKAWdUuA8J0yve2Dz/E/PdBepY03RvyJpssIpRFwJoCD55wA+mEDs6ByWA==} + seedrandom@3.0.5: + resolution: {integrity: sha512-8OwmbklUNzwezjGInmZ+2clQmExQPvomqjL7LFqOYqtmuxRgQYqOD3mHaU+MvZn5FLUeVxVfQjwLZW/n/JFuqg==} + semver-compare@1.0.0: resolution: {integrity: sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==} @@ -7767,6 +7790,9 @@ packages: xxhash-wasm@1.1.0: resolution: {integrity: sha512-147y/6YNh+tlp6nd/2pWq38i9h6mz/EuQ6njIrmW8D1BS5nCqs0P6DG+m6zTGnNz5I+uhZ0SHxBs9BsPrwcKDA==} + xxhashjs@0.2.2: + resolution: {integrity: sha512-AkTuIuVTET12tpsVIQo+ZU6f/qDmKuRUcjaqR+OIvm+aCBsZ95i7UVY5WJ9TMsSaZ0DA2WxoZ4acu0sPH+OKAw==} + y18n@5.0.8: resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==} engines: {node: '>=10'} @@ -10818,6 +10844,8 @@ snapshots: dependencies: '@types/node': 22.18.13 + '@types/seedrandom@3.0.8': {} + '@types/tar@6.1.13': dependencies: '@types/node': 22.18.13 @@ -11294,6 +11322,8 @@ snapshots: base-64@1.0.0: {} + base64-arraybuffer@1.0.2: {} + base64-js@1.5.1: {} baseline-browser-mapping@2.8.21: {} @@ -11320,6 +11350,17 @@ snapshots: bintrees@1.0.2: {} + bloom-filters@3.0.4: + dependencies: + '@types/seedrandom': 3.0.8 + base64-arraybuffer: 1.0.2 + is-buffer: 2.0.5 + lodash: 4.17.21 + long: 5.3.2 + reflect-metadata: 0.1.14 + seedrandom: 3.0.5 + xxhashjs: 0.2.2 + boolbase@1.0.0: {} boring-avatars@1.11.2: {} @@ -11695,6 +11736,8 @@ snapshots: csstype@3.2.3: {} + cuint@0.2.2: {} + cytoscape-cose-bilkent@4.1.0(cytoscape@3.33.1): dependencies: cose-base: 1.0.3 @@ -14442,6 +14485,8 @@ snapshots: unified: 11.0.5 vfile: 6.0.3 + reflect-metadata@0.1.14: {} + regex-recursion@6.0.2: dependencies: regex-utilities: 2.3.0 @@ -14708,6 +14753,8 @@ snapshots: secure-json-parse@4.1.0: {} + seedrandom@3.0.5: {} + semver-compare@1.0.0: {} semver@6.3.1: {} @@ -15814,6 +15861,10 @@ snapshots: xxhash-wasm@1.1.0: {} + xxhashjs@0.2.2: + dependencies: + cuint: 0.2.2 + y18n@5.0.8: {} yallist@3.1.1: {} From 2c94d417a9d8fc631c2035e9c245a49410fe727b Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 24 Nov 2025 13:27:31 +0100 Subject: [PATCH 09/30] refactor: simplify command options in package.json --- apps/ensrainbow/package.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index 341e0d440..7379e93d3 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -19,8 +19,8 @@ "validate:lite": "tsx src/cli.ts validate --lite", "purge": "tsx src/cli.ts purge", "convert": "tsx src/cli.ts convert", - "convert-csv": "NODE_OPTIONS='--expose-gc --max-old-space-size=4096' tsx src/cli.ts convert-csv", - "test": "NODE_OPTIONS='--max-old-space-size=8192' vitest", + "convert-csv": "tsx src/cli.ts convert-csv", + "test": "vitest", "test:coverage": "vitest --coverage", "lint": "biome check --write .", "lint:ci": "biome ci", From 721a50d4507261fcf4efb93c347c430e2c364a1d 
Mon Sep 17 00:00:00 2001
From: djstrong
Date: Thu, 11 Dec 2025 20:56:40 +0100
Subject: [PATCH 10/30] refactor: improve memory management and logging in CSV
 conversion

- Added a function to estimate memory usage of Maps for better tracking.
- Reduced default cache size in DeduplicationDB from 10000 to 1000.
- Enhanced backpressure handling during CSV writing to prevent memory overflow.
- Updated logging to report output backpressure events and reduced logging overhead for large files.
- Streamlined the CSV processing to operate in a completely sequential manner.
---
 .../src/commands/convert-csv-command.ts       | 245 +++++++++---------
 1 file changed, 128 insertions(+), 117 deletions(-)

diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts
index 0e0c8ac0e..db7478664 100644
--- a/apps/ensrainbow/src/commands/convert-csv-command.ts
+++ b/apps/ensrainbow/src/commands/convert-csv-command.ts
@@ -21,17 +21,27 @@ import {
   createRainbowProtobufRoot,
 } from "../utils/protobuf-schema.js";
 
+/**
+ * Estimate memory usage of a Map (rough approximation)
+ */
+function estimateMapMemory(map: Map<string, string>): number {
+  let total = 0;
+  for (const [key, value] of map) {
+    // Rough estimate: key size + value size + Map overhead (48 bytes per entry)
+    total += (key.length * 2) + (typeof value === 'string' ? value.length * 2 : 8) + 48;
+  }
+  return total;
+}
+
 /**
  * Simple deduplication database using ClassicLevel directly
 */
 class DeduplicationDB {
   private pendingWrites: Map<string, string> = new Map();
-  private cache: Map<string, boolean> = new Map();
-  private cacheSize: number;
   private bloomFilter: typeof bloomFilters.BloomFilter | null = null;

-  constructor(private db: ClassicLevel, cacheSize: number = 10000, useBloomFilter: boolean = false, expectedItems: number = 10000000) {
-    this.cacheSize = cacheSize;
+  constructor(private db: ClassicLevel, cacheSize: number = 1000, useBloomFilter: boolean = false, expectedItems: number = 10000000) {
+    // No in-memory cache - LevelDB has its own internal cache

     if (useBloomFilter) {
       // Create Bloom filter with 0.1% false positive rate
@@ -41,68 +51,41 @@ class DeduplicationDB {
   }

   async has(key: string): Promise<boolean> {
-    // Check cache first
-    if (this.cache.has(key)) {
-      return this.cache.get(key)!;
-    }
-
-    // Check pending writes
+    // Check pending writes first (not yet flushed to DB)
     if (this.pendingWrites.has(key)) {
-      this.cache.set(key, true);
       return true;
     }

-    // Use Bloom filter if available
+    // Use Bloom filter if available (skip expensive DB lookup)
     if (this.bloomFilter) {
-      // If Bloom filter says "not present", we can skip LevelDB check
       if (!this.bloomFilter.has(key)) {
-        this.cache.set(key, false);
        return false;
      }
-      // Bloom filter says "maybe present" - need to check LevelDB
    }

-    // Check database
+    // Check database (LevelDB has its own internal cache)
    try {
      await this.db.get(key);
-      this.cache.set(key, true);
      return true;
    } catch (error) {
-      this.cache.set(key, false);
      return false;
    }
  }

  async add(key: string, value: string): Promise<void> {
    this.pendingWrites.set(key, value);
-    this.cache.set(key, true); // Cache the fact that this key exists

    // Add to Bloom filter if available
    if (this.bloomFilter) {
      this.bloomFilter.add(key);
    }

-    // Check cache size periodically (not on every add)
-    this.evictCacheIfNeeded();
-
-    // Flush to database periodically (smaller batch to reduce memory usage)
-    if (this.pendingWrites.size >= 5000) {
+    // Flush frequently to keep pendingWrites small
+    if (this.pendingWrites.size >= 1000) {
      await this.flush();
    }
  }

-  private evictCacheIfNeeded(): void {
-    // Limit cache size - only evict when significantly exceeded
-    if (this.cache.size > this.cacheSize * 1.2) {
-      // Remove oldest 20% of entries
-      let toRemove = Math.floor(this.cacheSize * 0.2);
-      for (const key of this.cache.keys()) {
-        if (toRemove-- <= 0) break;
-        this.cache.delete(key);
-      }
-    }
-  }
-
  async flush(): Promise<void> {
    if (this.pendingWrites.size === 0) return;

@@ -123,6 +106,15 @@ class DeduplicationDB {
    await this.flush();
    await this.db.close();
  }
+
+  getMemoryStats(): { pendingWrites: number; cache: number; pendingWritesMB: number; cacheMB: number } {
+    return {
+      pendingWrites: this.pendingWrites.size,
+      cache: 0, // Cache disabled - using LevelDB's internal cache
+      pendingWritesMB: estimateMapMemory(this.pendingWrites) / 1024 / 1024,
+      cacheMB: 0,
+    };
+  }
 }
 
 
@@ -159,13 +151,14 @@ export interface ConvertCsvCommandOptions {
 }
 
 // Configuration constants
-const DEFAULT_PROGRESS_INTERVAL = 10000;
+const DEFAULT_PROGRESS_INTERVAL = 50000; // Increased from 10k to 50k to reduce logging load
 
 interface ConversionStats {
   totalLines: number;
   processedRecords: number;
   filteredExistingLabels: number;
   filteredDuplicates: number;
+  outputBackpressureEvents: number;
   startTime: Date;
   endTime?: Date;
 }
@@ -174,8 +167,12 @@ interface ConversionStats {
 * Setup output stream for writing protobuf
 */
 function setupWriteStream(outputFile: string) {
-  // For now, just write directly to file without gzip compression
-  return createWriteStream(outputFile);
+  // Use very small highWaterMark (16KB) to trigger backpressure early and frequently
+  // This prevents unbounded buffer growth when writes are faster than disk I/O
+  // Smaller buffer = more frequent backpressure = better memory control
+  return createWriteStream(outputFile, {
+    highWaterMark: 16 * 1024, // 16KB buffer - very small to catch backpressure early
+  });
 }
 
/**
@@ -213,6 +210,7 @@ function logSummary(stats: ConversionStats) {
  logger.info(`Valid records: ${stats.processedRecords}`);
  logger.info(`Filtered existing labels: ${stats.filteredExistingLabels}`);
  logger.info(`Filtered duplicates: ${stats.filteredDuplicates}`);
+  logger.info(`Output backpressure events: ${stats.outputBackpressureEvents}`);
  logger.info(`Duration: ${duration}ms`);
}

@@ -246,8 +244,8 @@ async function initializeConversion(options: ConvertCsvCommandOptions) {
     logger.info(`Input file size: ${fileSizeMB} MB`);
 
     if (stats.size > 1024 * 1024 * 1024) { // > 1GB
-      logger.warn("⚠️ Processing a very large file. 
This may take significant time and memory."); - logger.warn("💡 Consider using --existing-db-path to filter out existing labels for better performance."); + logger.warn("⚠️ Processing a very large file - using SEQUENTIAL mode."); + logger.warn("💡 Use --existing-db-path to filter existing labels and speed up processing."); } } catch (error) { logger.warn(`Could not determine file size: ${error}`); @@ -351,15 +349,27 @@ async function processRecord( await dedupDb.add(label, ""); } - // Create protobuf message and write immediately + // Create protobuf message and write with backpressure handling const recordMessage = RainbowRecordType.fromObject(rainbowRecord); - outputStream.write(Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish())); + const buffer = Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish()); + + // Check if write returns false (buffer full) - if so, wait for drain + const canContinue = outputStream.write(buffer); + if (!canContinue) { + // Buffer is full - signal backpressure + stats.outputBackpressureEvents++; + // Wait for drain event before continuing + // Note: The CSV stream should be paused by the caller when backpressure is detected + await new Promise((resolve) => { + outputStream.once('drain', resolve); + }); + } return true; // Record was processed } /** - * Process the entire CSV file using fast-csv + * Process the entire CSV file - COMPLETELY SEQUENTIAL (one row at a time) */ async function processCSVFile( inputFile: string, @@ -374,102 +384,97 @@ async function processCSVFile( let expectedColumns: number | null = null; let lineNumber = 0; let processedRecords = 0; - let lastLoggedLine = 0; // Track last logged line to avoid duplicate logs - const startTime = Date.now(); // Track start time for overall processing - let lastLogTime = Date.now(); // Track time of last log for chunk timing - - // LevelDB-based deduplication: Uses temporary database to avoid RAM limits + let lastLoggedLine = 0; + const startTime = Date.now(); + let lastLogTime = Date.now(); const fileStream = createReadStream(inputFile, { encoding: "utf8" }); return new Promise((resolve, reject) => { - let pendingCount = 0; - const MAX_PENDING = 100; // Smaller limit to reduce memory + const csvStream = parse(); // Sequential processing via pause/resume + let isProcessing = false; + + csvStream + .on("data", async (row: string[]) => { + // PAUSE IMMEDIATELY - process one row at a time + csvStream.pause(); + isProcessing = true; - const csvStream = parse() - .on("data", (row: string[]) => { lineNumber++; - // For the first row, detect column count - if (expectedColumns === null) { - expectedColumns = row.length; - logger.info(`Detected ${expectedColumns} columns using fast-csv`); - } + try { + // Detect column count on first row + if (expectedColumns === null) { + expectedColumns = row.length; + logger.info(`Detected ${expectedColumns} columns - SEQUENTIAL processing mode`); + } - // Log progress synchronously when line is read (not in async callback) - // This ensures logs appear at the correct intervals - if (lineNumber % progressInterval === 0 && lineNumber !== lastLoggedLine) { - const currentTime = Date.now(); - const chunkTime = currentTime - lastLogTime; // Time for this 10k chunk - const totalElapsed = currentTime - startTime; // Total time since start - const chunkTimeSeconds = (chunkTime / 1000).toFixed(2); - const totalTimeSeconds = (totalElapsed / 1000).toFixed(2); - const linesPerSecond = ((progressInterval / chunkTime) * 1000).toFixed(0); - - 
lastLoggedLine = lineNumber; - lastLogTime = currentTime; - - // Note: processedRecords may be slightly behind due to async processing - logger.info( - `Processed ${lineNumber} lines, written ${processedRecords} records | ` + - `Chunk: ${chunkTimeSeconds}s (${linesPerSecond} lines/sec) | ` + - `Total: ${totalTimeSeconds}s` - ); - } + // Log progress (less frequently to avoid logger crashes) + if (lineNumber % progressInterval === 0 && lineNumber !== lastLoggedLine) { + const currentTime = Date.now(); + const chunkTime = currentTime - lastLogTime; + const totalElapsed = currentTime - startTime; + const chunkTimeSeconds = (chunkTime / 1000).toFixed(2); + const totalTimeSeconds = (totalElapsed / 1000).toFixed(2); + const linesPerSecond = ((progressInterval / chunkTime) * 1000).toFixed(0); + + lastLoggedLine = lineNumber; + lastLogTime = currentTime; + + const memUsage = process.memoryUsage(); + const memInfo = `RSS=${(memUsage.rss / 1024 / 1024).toFixed(0)}MB, Heap=${(memUsage.heapUsed / 1024 / 1024).toFixed(0)}MB`; + + let dedupInfo = ""; + if (dedupDb) { + const dedupStats = dedupDb.getMemoryStats(); + dedupInfo = ` | Dedup: ${dedupStats.pendingWrites}/${dedupStats.cache}`; + } + + // Use console.log instead of logger to avoid worker thread issues + console.log( + `[${new Date().toISOString()}] Line ${lineNumber}, written ${processedRecords} | ` + + `${linesPerSecond} lines/sec | ${memInfo}${dedupInfo}` + ); + } - // Backpressure: pause if too many pending - if (pendingCount >= MAX_PENDING) { - csvStream.pause(); - } + // Process this one record + const wasProcessed = await processRecord( + row, + expectedColumns, + RainbowRecordType, + outputStream, + lineNumber, + existingDb, + dedupDb, + stats, + ); - pendingCount++; - processRecord( - row, - expectedColumns, - RainbowRecordType, - outputStream, - lineNumber, - existingDb, - dedupDb, - stats, - ).then((wasProcessed) => { if (wasProcessed) { processedRecords++; } - - // Update progress bar every 1000 lines + + // Update progress bar if (lineNumber % 1000 === 0 && progressBar) { progressBar.tick(1000); progressBar.curr = lineNumber; } - - pendingCount--; - - // Resume when under threshold - if (csvStream.isPaused() && pendingCount < MAX_PENDING / 2) { - csvStream.resume(); - } - }).catch((error) => { + + // Done processing - resume for next row + isProcessing = false; + csvStream.resume(); + + } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); csvStream.destroy(); fileStream.destroy(); - reject( - new Error( - `CSV conversion failed due to invalid data on line ${lineNumber}: ${errorMessage}`, - ), - ); - }); + reject(new Error(`Failed on line ${lineNumber}: ${errorMessage}`)); + } }) .on("error", (error: Error) => { reject(new Error(`CSV parsing error: ${error.message}`)); }) - .on("end", async () => { - // Wait for all pending to complete - while (pendingCount > 0) { - await new Promise(resolve => setTimeout(resolve, 10)); - } - const dedupStatus = dedupDb ? 
"LevelDB deduplication completed" : "Deduplication disabled"; - logger.info(dedupStatus); + .on("end", () => { + logger.info(`Sequential processing complete`); resolve({ totalLines: lineNumber, processedRecords }); }); @@ -490,6 +495,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom processedRecords: 0, filteredExistingLabels: 0, filteredDuplicates: 0, + outputBackpressureEvents: 0, startTime: new Date(), }; @@ -509,11 +515,16 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom keyEncoding: 'utf8', valueEncoding: 'utf8', createIfMissing: true, + // Aggressive memory limits + cacheSize: 2 * 1024 * 1024, // 2MB block cache (minimal) + writeBufferSize: 4 * 1024 * 1024, // 4MB write buffer (minimal) + maxOpenFiles: 100, // Limit open files + compression: false, // Disable compression to reduce CPU/memory }); await tempDb.open(); dedupDb = new DeduplicationDB( tempDb, - options.cacheSize ?? 10000, + options.cacheSize ?? 1000, // Reduced default from 10000 to 1000 options.useBloomFilter ?? false, options.bloomFilterSize ?? 10000000 ); From 56bc3563a512da001524cd501368dde8816d3118 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 15 Dec 2025 15:44:59 +0100 Subject: [PATCH 11/30] refactor: streamline CSV conversion CLI options and improve logging - Removed unused command-line options for deduplication and Bloom filter from the CLI interface. - Updated default progress interval from 10000 to 50000 records for improved performance. - Enhanced logging for file processing and memory management during CSV conversion. - Cleaned up code for better readability and maintainability. --- apps/ensrainbow/src/cli.ts | 56 ++----- .../src/commands/convert-csv-command.test.ts | 15 +- .../src/commands/convert-csv-command.ts | 141 +++++++----------- .../ensrainbow/concepts/creating-files.mdx | 6 +- 4 files changed, 82 insertions(+), 136 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 6e6bb4f32..de84a0963 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -69,11 +69,7 @@ interface ConvertCsvArgs { "label-set-version": LabelSetVersion; "progress-interval"?: number; "existing-db-path"?: string; - "silent"?: boolean; - "disable-dedup"?: boolean; - "cache-size"?: number; - "use-bloom-filter"?: boolean; - "bloom-filter-size"?: number; + silent?: boolean; } export interface CLIOptions { @@ -264,37 +260,17 @@ export function createCLI(options: CLIOptions = {}) { .option("progress-interval", { type: "number", description: "Number of records to process before logging progress", - default: 10000, + default: 50000, }) - .option("existing-db-path", { - type: "string", - description: "Path to existing ENSRainbow database to filter out existing labels", - }) - .option("silent", { - type: "boolean", - description: "Disable progress bar (useful for scripts)", - default: false, - }) - .option("disable-dedup", { - type: "boolean", - description: "Disable deduplication within CSV file (faster but may create duplicates)", - default: false, - }) - .option("cache-size", { - type: "number", - description: "Cache size for deduplication (default: 5000)", - default: 5000, - }) - .option("use-bloom-filter", { - type: "boolean", - description: "Use Bloom filter for faster deduplication (default: false)", - default: false, - }) - .option("bloom-filter-size", { - type: "number", - description: "Expected number of items for Bloom filter (default: 10000000)", - default: 10000000, - }); + .option("existing-db-path", 
{ + type: "string", + description: "Path to existing ENSRainbow database to filter out existing labels", + }) + .option("silent", { + type: "boolean", + description: "Disable progress bar (useful for scripts)", + default: false, + }); }, async (argv: ArgumentsCamelCase) => { await convertCsvCommand({ @@ -302,13 +278,9 @@ export function createCLI(options: CLIOptions = {}) { outputFile: argv["output-file"], labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], - progressInterval: argv["progress-interval"], - existingDbPath: argv["existing-db-path"], - silent: argv["silent"], - noDedup: argv["disable-dedup"], - cacheSize: argv["cache-size"], - useBloomFilter: argv["use-bloom-filter"], - bloomFilterSize: argv["bloom-filter-size"], + progressInterval: argv["progress-interval"], + existingDbPath: argv["existing-db-path"], + silent: argv["silent"], }); }, ) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index c6ddadb03..4f5b37eb6 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -1,12 +1,15 @@ +import { mkdtemp, rm, stat, writeFile } from "fs/promises"; import { tmpdir } from "os"; import { join } from "path"; -import { mkdtemp, rm, stat, writeFile } from "fs/promises"; + +import { labelhash } from "viem"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { type LabelSetId, type LabelSetVersion, labelHashToBytes } from "@ensnode/ensnode-sdk"; + import { createCLI } from "@/cli"; import { ENSRainbowDB } from "@/lib/database"; -import { type LabelSetId, type LabelSetVersion, labelHashToBytes } from "@ensnode/ensnode-sdk"; -import { labelhash } from "viem"; + import { convertCsvCommand } from "./convert-csv-command"; // Path to test fixtures @@ -406,10 +409,10 @@ describe("convert-csv-command", () => { // Verify file was created const stats = await stat(outputFile); - expect(stats.isFile()).toBe(true); - expect(stats.size).toBeGreaterThan(0); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + }); }); -}); describe("Streaming performance", () => { it("should handle small CSV files efficiently", async () => { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index db7478664..47d790a69 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -5,15 +5,16 @@ * Supports 1-column (label only) and 2-column (label,labelhash) formats */ -import { createReadStream, createWriteStream, statSync } from "fs"; -import { rmSync } from "fs"; +import { createReadStream, createWriteStream, rmSync, statSync } from "fs"; import { join } from "path"; -import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk"; + import { parse } from "@fast-csv/parse"; -import { labelhash } from "viem"; import { ClassicLevel } from "classic-level"; import ProgressBar from "progress"; -import bloomFilters from "bloom-filters"; +import { labelhash } from "viem"; + +import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk"; + import { ENSRainbowDB } from "../lib/database.js"; import { logger } from "../utils/logger.js"; import { @@ -28,7 +29,7 @@ function estimateMapMemory(map: Map): number { let total = 0; for (const [key, value] of map) { // Rough estimate: key size + value size + Map overhead (48 bytes per entry) - total += 
(key.length * 2) + (typeof value === 'string' ? value.length * 2 : 8) + 48;
+    total += key.length * 2 + (typeof value === "string" ? value.length * 2 : 8) + 48;
   }
   return total;
 }
@@ -38,16 +39,9 @@ function estimateMapMemory(map: Map<string, string>): number {
 */
 class DeduplicationDB {
   private pendingWrites: Map<string, string> = new Map();
-  private bloomFilter: typeof bloomFilters.BloomFilter | null = null;

-  constructor(private db: ClassicLevel, cacheSize: number = 1000, useBloomFilter: boolean = false, expectedItems: number = 10000000) {
+  constructor(private db: ClassicLevel) {
     // No in-memory cache - LevelDB has its own internal cache
-
-    if (useBloomFilter) {
-      // Create Bloom filter with 0.1% false positive rate
-      this.bloomFilter = bloomFilters.BloomFilter.create(expectedItems, 0.01);
-      logger.info(`Created Bloom filter for ${expectedItems} items (~${(this.bloomFilter.size / 8 / 1024 / 1024).toFixed(2)} MB)`);
-    }
   }

   async has(key: string): Promise<boolean> {
@@ -56,13 +50,6 @@
    return true;
  }

-    // Use Bloom filter if available (skip expensive DB lookup)
-    if (this.bloomFilter) {
-      if (!this.bloomFilter.has(key)) {
-        return false;
-      }
-    }
-
     // Check database (LevelDB has its own internal cache)
    try {
      await this.db.get(key);
@@ -74,12 +61,7 @@

  async add(key: string, value: string): Promise<void> {
    this.pendingWrites.set(key, value);
-
-    // Add to Bloom filter if available
-    if (this.bloomFilter) {
-      this.bloomFilter.add(key);
-    }
-
+
    // Flush frequently to keep pendingWrites small
    if (this.pendingWrites.size >= 1000) {
      await this.flush();
@@ -95,7 +77,7 @@
    }
    await batch.write();
    this.pendingWrites.clear();
-
+
    // Hint to garbage collector after large batch
    if (global.gc) {
      global.gc();
@@ -107,7 +89,12 @@
    await this.db.close();
  }

-  getMemoryStats(): { pendingWrites: number; cache: number; pendingWritesMB: number; cacheMB: number } {
+  getMemoryStats(): {
+    pendingWrites: number;
+    cache: number;
+    pendingWritesMB: number;
+    cacheMB: number;
+  } {
    return {
      pendingWrites: this.pendingWrites.size,
      cache: 0, // Cache disabled - using LevelDB's internal cache
@@ -117,20 +104,16 @@
  }
 }
 
-
 /**
 * Sets up a simple progress bar that shows speed without total count. 
*/ function setupProgressBar(): ProgressBar { - return new ProgressBar( - "Processing CSV [:bar] :current lines - :rate lines/sec", - { - complete: "=", - incomplete: " ", - width: 40, - total: 200000000, // Very large total for big files - }, - ); + return new ProgressBar("Processing CSV [:bar] :current lines - :rate lines/sec", { + complete: "=", + incomplete: " ", + width: 40, + total: 200000000, // Very large total for big files + }); } /** @@ -144,10 +127,6 @@ export interface ConvertCsvCommandOptions { progressInterval?: number; existingDbPath?: string; // Path to existing ENSRainbow database to check for existing labels silent?: boolean; // Disable progress bar for tests - noDedup?: boolean; // Disable deduplication within CSV file - cacheSize?: number; // Cache size for deduplication (default: 10000) - useBloomFilter?: boolean; // Use Bloom filter for faster deduplication (default: false) - bloomFilterSize?: number; // Expected number of items for Bloom filter (default: 10000000) } // Configuration constants @@ -242,8 +221,9 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { const stats = statSync(options.inputFile); const fileSizeMB = (stats.size / (1024 * 1024)).toFixed(2); logger.info(`Input file size: ${fileSizeMB} MB`); - - if (stats.size > 1024 * 1024 * 1024) { // > 1GB + + if (stats.size > 1024 * 1024 * 1024) { + // > 1GB logger.warn("⚠️ Processing a very large file - using SEQUENTIAL mode."); logger.warn("💡 Use --existing-db-path to filter existing labels and speed up processing."); } @@ -352,7 +332,7 @@ async function processRecord( // Create protobuf message and write with backpressure handling const recordMessage = RainbowRecordType.fromObject(rainbowRecord); const buffer = Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish()); - + // Check if write returns false (buffer full) - if so, wait for drain const canContinue = outputStream.write(buffer); if (!canContinue) { @@ -361,7 +341,7 @@ async function processRecord( // Wait for drain event before continuing // Note: The CSV stream should be paused by the caller when backpressure is detected await new Promise((resolve) => { - outputStream.once('drain', resolve); + outputStream.once("drain", resolve); }); } @@ -417,23 +397,23 @@ async function processCSVFile( const chunkTimeSeconds = (chunkTime / 1000).toFixed(2); const totalTimeSeconds = (totalElapsed / 1000).toFixed(2); const linesPerSecond = ((progressInterval / chunkTime) * 1000).toFixed(0); - + lastLoggedLine = lineNumber; lastLogTime = currentTime; - + const memUsage = process.memoryUsage(); const memInfo = `RSS=${(memUsage.rss / 1024 / 1024).toFixed(0)}MB, Heap=${(memUsage.heapUsed / 1024 / 1024).toFixed(0)}MB`; - + let dedupInfo = ""; if (dedupDb) { const dedupStats = dedupDb.getMemoryStats(); dedupInfo = ` | Dedup: ${dedupStats.pendingWrites}/${dedupStats.cache}`; } - + // Use console.log instead of logger to avoid worker thread issues console.log( `[${new Date().toISOString()}] Line ${lineNumber}, written ${processedRecords} | ` + - `${linesPerSecond} lines/sec | ${memInfo}${dedupInfo}` + `${linesPerSecond} lines/sec | ${memInfo}${dedupInfo}`, ); } @@ -462,7 +442,6 @@ async function processCSVFile( // Done processing - resume for next row isProcessing = false; csvStream.resume(); - } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); csvStream.destroy(); @@ -507,30 +486,21 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom const { RainbowRecordType, outputStream, existingDb: db } = await initializeConversion(options); existingDb = db; - // Create temporary deduplication database (if not disabled) - if (!options.noDedup) { - tempDedupDir = join(process.cwd(), 'temp-dedup-' + Date.now()); - logger.info(`Creating temporary deduplication database at: ${tempDedupDir}`); - const tempDb = new ClassicLevel(tempDedupDir, { - keyEncoding: 'utf8', - valueEncoding: 'utf8', - createIfMissing: true, - // Aggressive memory limits - cacheSize: 2 * 1024 * 1024, // 2MB block cache (minimal) - writeBufferSize: 4 * 1024 * 1024, // 4MB write buffer (minimal) - maxOpenFiles: 100, // Limit open files - compression: false, // Disable compression to reduce CPU/memory - }); - await tempDb.open(); - dedupDb = new DeduplicationDB( - tempDb, - options.cacheSize ?? 1000, // Reduced default from 10000 to 1000 - options.useBloomFilter ?? false, - options.bloomFilterSize ?? 10000000 - ); - } else { - logger.info("Deduplication disabled - processing all records"); - } + // Create temporary deduplication database + tempDedupDir = join(process.cwd(), "temp-dedup-" + Date.now()); + logger.info(`Creating temporary deduplication database at: ${tempDedupDir}`); + const tempDb = new ClassicLevel(tempDedupDir, { + keyEncoding: "utf8", + valueEncoding: "utf8", + createIfMissing: true, + // Aggressive memory limits + cacheSize: 2 * 1024 * 1024, // 2MB block cache (minimal) + writeBufferSize: 4 * 1024 * 1024, // 4MB write buffer (minimal) + maxOpenFiles: 100, // Limit open files + compression: false, // Disable compression to reduce CPU/memory + }); + await tempDb.open(); + dedupDb = new DeduplicationDB(tempDb); const progressInterval = options.progressInterval ?? DEFAULT_PROGRESS_INTERVAL; @@ -549,16 +519,15 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom progressBar, ); - stats.totalLines = totalLines; - stats.processedRecords = processedRecords; + stats.totalLines = totalLines; + stats.processedRecords = processedRecords; - // Log final progress for large files - if (totalLines > 10_000) { - const dedupStatus = options.noDedup ? 
"dedup disabled" : "LevelDB dedup active"; - logger.info( - `✅ Completed processing ${totalLines.toLocaleString()} lines, wrote ${processedRecords.toLocaleString()} records (${dedupStatus})`, - ); - } + // Log final progress for large files + if (totalLines > 10_000) { + logger.info( + `✅ Completed processing ${totalLines.toLocaleString()} lines, wrote ${processedRecords.toLocaleString()} records (LevelDB dedup active)`, + ); + } // Close output stream outputStream.end(); diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 125e9916a..2d9ec8c10 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -124,7 +124,8 @@ pnpm run convert-csv \ --label-set-id \ --label-set-version \ [--progress-interval ] \ - [--existing-db-path ] + [--existing-db-path ] \ + [--silent] ``` ### Required Parameters @@ -136,8 +137,9 @@ pnpm run convert-csv \ ### Optional Parameters - `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) -- `--progress-interval`: Progress logging frequency (default: 10000 records) +- `--progress-interval`: Progress logging frequency (default: 50000 records) - `--existing-db-path`: Path to existing ENSRainbow database to filter out existing labels +- `--silent`: Disable progress bar (useful for scripts and automated workflows) ### CSV Format Support From 11992d7abab25e36184d504bd9a82ab660b9e111 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 15 Dec 2025 16:33:19 +0100 Subject: [PATCH 12/30] fix: improve error handling and logging in CSV conversion tests --- .../src/commands/convert-csv-command.test.ts | 4 +- .../src/commands/convert-csv-command.ts | 47 +++++++++++++++---- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 4f5b37eb6..f3e85f6fa 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -110,7 +110,7 @@ describe("convert-csv-command", () => { labelSetId: "test-csv-invalid" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, }), - ).rejects.toThrow(/CSV conversion failed due to invalid data/); + ).rejects.toThrow(/Failed on line 1: Invalid labelHash/); }); it("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { @@ -167,7 +167,7 @@ describe("convert-csv-command", () => { labelSetId: "test-csv-invalid-hash" as LabelSetId, labelSetVersion: 0 as LabelSetVersion, }), - ).rejects.toThrow(/CSV conversion failed due to invalid data/); + ).rejects.toThrow(/Failed on line 2: Invalid labelHash/); }); }); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 47d790a69..3a0f14d84 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -274,13 +274,22 @@ function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string }; } else { // Two columns: validate and use provided hash - const providedHash = String(row[1]); + // Trim whitespace from hash (metadata), but preserve label as-is + const providedHash = String(row[1]).trim(); + if (providedHash === "") { + throw new Error("LabelHash cannot be empty"); + } const maybeLabelHash = 
providedHash.startsWith("0x") ? providedHash : `0x${providedHash}`; - const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); - return { - labelhash: Buffer.from(labelHash), - label: label, - }; + try { + const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); + return { + labelhash: Buffer.from(labelHash), + label: label, + }; + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error); + throw new Error(`Invalid labelHash: ${errorMessage}`); + } } } @@ -373,6 +382,14 @@ async function processCSVFile( return new Promise((resolve, reject) => { const csvStream = parse(); // Sequential processing via pause/resume let isProcessing = false; + let streamEnded = false; + + const checkAndResolve = () => { + if (streamEnded && !isProcessing) { + logger.info(`Sequential processing complete`); + resolve({ totalLines: lineNumber, processedRecords }); + } + }; csvStream .on("data", async (row: string[]) => { @@ -383,7 +400,16 @@ async function processCSVFile( lineNumber++; try { - // Detect column count on first row + // Skip empty rows (no columns or all empty strings) + const isEmptyRow = row.length === 0 || row.every((cell) => cell === ""); + if (isEmptyRow) { + isProcessing = false; + csvStream.resume(); + checkAndResolve(); + return; + } + + // Detect column count on first non-empty row if (expectedColumns === null) { expectedColumns = row.length; logger.info(`Detected ${expectedColumns} columns - SEQUENTIAL processing mode`); @@ -442,6 +468,7 @@ async function processCSVFile( // Done processing - resume for next row isProcessing = false; csvStream.resume(); + checkAndResolve(); } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); csvStream.destroy(); @@ -453,8 +480,8 @@ async function processCSVFile( reject(new Error(`CSV parsing error: ${error.message}`)); }) .on("end", () => { - logger.info(`Sequential processing complete`); - resolve({ totalLines: lineNumber, processedRecords }); + streamEnded = true; + checkAndResolve(); }); fileStream @@ -537,7 +564,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom logger.info("✅ CSV conversion completed successfully!"); } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); - logger.error("❌ CSV conversion failed:", errorMessage); + logger.error(`❌ CSV conversion failed: ${errorMessage}`); throw error; } finally { // Clean up deduplication database From 3dea60ecf687863f3de34a3589512720618373e7 Mon Sep 17 00:00:00 2001 From: djstrong Date: Tue, 16 Dec 2025 21:59:20 +0100 Subject: [PATCH 13/30] refactor: update CSV conversion logic and improve deduplication handling --- .../src/commands/convert-csv-command.ts | 44 +++++++++---------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 3a0f14d84..4770a0b1e 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -112,7 +112,7 @@ function setupProgressBar(): ProgressBar { complete: "=", incomplete: " ", width: 40, - total: 200000000, // Very large total for big files + total: 300000000, // Very large total for big files }); } @@ -225,7 +225,6 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { if (stats.size > 1024 * 1024 * 1024) { // > 1GB logger.warn("⚠️ Processing a very large file - using SEQUENTIAL mode."); - logger.warn("💡 Use --existing-db-path to filter existing labels and speed up processing."); } } catch (error) { logger.warn(`Could not determine file size: ${error}`); @@ -303,7 +302,7 @@ async function processRecord( outputStream: NodeJS.WritableStream, lineNumber: number, existingDb: ENSRainbowDB | null, - dedupDb: DeduplicationDB | null, + dedupDb: DeduplicationDB, stats: ConversionStats, ): Promise { // Validate column count @@ -326,18 +325,16 @@ async function processRecord( } } - // Check if label is a duplicate within this conversion using LevelDB (if enabled) - if (dedupDb) { - const existsInDedupDb = await dedupDb.has(label); - if (existsInDedupDb) { - stats.filteredDuplicates++; - return false; // Skip this record - } - - // Add label to deduplication database - await dedupDb.add(label, ""); + // Check if label is a duplicate within this conversion using LevelDB + const existsInDedupDb = await dedupDb.has(label); + if (existsInDedupDb) { + stats.filteredDuplicates++; + return false; // Skip this record } + // Add label to deduplication database + await dedupDb.add(label, ""); + // Create protobuf message and write with backpressure handling const recordMessage = RainbowRecordType.fromObject(rainbowRecord); const buffer = Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish()); @@ -366,7 +363,7 @@ async function processCSVFile( outputStream: NodeJS.WritableStream, progressInterval: number, existingDb: ENSRainbowDB | null, - dedupDb: DeduplicationDB | null, + dedupDb: DeduplicationDB, stats: ConversionStats, progressBar: ProgressBar | null, ): Promise<{ totalLines: number; processedRecords: number }> { @@ -419,9 +416,6 @@ async function processCSVFile( if (lineNumber % progressInterval === 0 && lineNumber !== lastLoggedLine) { const currentTime = Date.now(); const chunkTime = currentTime - lastLogTime; - const totalElapsed = currentTime - startTime; - const chunkTimeSeconds = (chunkTime / 1000).toFixed(2); - const totalTimeSeconds = (totalElapsed / 1000).toFixed(2); const linesPerSecond = ((progressInterval / chunkTime) * 1000).toFixed(0); lastLoggedLine = lineNumber; @@ -430,11 +424,8 @@ async function processCSVFile( const memUsage = process.memoryUsage(); const memInfo = `RSS=${(memUsage.rss / 1024 / 1024).toFixed(0)}MB, 
Heap=${(memUsage.heapUsed / 1024 / 1024).toFixed(0)}MB`; - let dedupInfo = ""; - if (dedupDb) { - const dedupStats = dedupDb.getMemoryStats(); - dedupInfo = ` | Dedup: ${dedupStats.pendingWrites}/${dedupStats.cache}`; - } + const dedupStats = dedupDb.getMemoryStats(); + const dedupInfo = ` | Dedup: ${dedupStats.pendingWrites}/${dedupStats.cache}`; // Use console.log instead of logger to avoid worker thread issues console.log( @@ -496,6 +487,11 @@ async function processCSVFile( * Main CSV conversion command with true streaming using fast-csv */ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Promise { + // Validate that existingDbPath is provided when labelSetVersion > 0 + if (options.labelSetVersion > 0 && !options.existingDbPath) { + throw new Error("existingDbPath must be specified if label set version is higher than 0"); + } + const stats: ConversionStats = { totalLines: 0, processedRecords: 0, @@ -506,7 +502,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom }; let existingDb: ENSRainbowDB | null = null; - let dedupDb: DeduplicationDB | null = null; + let dedupDb: DeduplicationDB | undefined; let tempDedupDir: string | null = null; try { @@ -568,7 +564,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom throw error; } finally { // Clean up deduplication database - if (dedupDb) { + if (dedupDb !== undefined) { try { await dedupDb.close(); logger.info("Closed deduplication database"); From b02b7f17cbd261af372f831061a9f6e75266a76d Mon Sep 17 00:00:00 2001 From: djstrong Date: Wed, 17 Dec 2025 16:48:52 +0100 Subject: [PATCH 14/30] refactor: remove unused dependencies and enhance CSV conversion tests --- apps/ensrainbow/package.json | 1 - .../src/commands/convert-csv-command.test.ts | 204 ++++++++++++++++++ .../src/commands/convert-csv-command.ts | 18 +- pnpm-lock.yaml | 51 ----- 4 files changed, 215 insertions(+), 59 deletions(-) diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index 387e79cbd..024d6f567 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -33,7 +33,6 @@ "@ensnode/ensrainbow-sdk": "workspace:*", "@ensnode/ensnode-sdk": "workspace:*", "@hono/node-server": "^1.4.1", - "bloom-filters": "^3.0.4", "classic-level": "^1.4.1", "hono": "catalog:", "pino": "catalog:", diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index f3e85f6fa..e45c8712c 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -490,4 +490,208 @@ describe("convert-csv-command", () => { expect(stats.size).toBeGreaterThan(0); }, 60000); // 60 second timeout for large file test }); + + describe("Edge cases", () => { + it("should handle empty CSV file", async () => { + const inputFile = join(tempDir, "empty.csv"); + const outputFile = join(tempDir, "output_empty.ensrainbow"); + await writeFile(inputFile, ""); + + // Should not throw error for empty file + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-empty" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + silent: true, + }), + ).resolves.not.toThrow(); + + // Verify the output file was created (should have header only) + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Ingest and verify no records were written + const dataDir = join(tempDir, 
"db_empty"); + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + expect(recordsCount).toBe(0); + await db.close(); + }); + + it("should handle CSV file with only whitespace", async () => { + const inputFile = join(tempDir, "whitespace.csv"); + const outputFile = join(tempDir, "output_whitespace.ensrainbow"); + await writeFile(inputFile, " \n \n\t\n "); + + // Should not throw error for whitespace-only file + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-whitespace" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + silent: true, + }), + ).resolves.not.toThrow(); + + // Verify the output file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + }); + + it("should skip CSV header row if present", async () => { + const inputFile = join(tempDir, "with_header.csv"); + const outputFile = join(tempDir, "output_header.ensrainbow"); + const csvContent = + "label,labelhash\nalice,0x9c0257114eb9399a2985f8e75dad7600c5d89fe3824ffa99ec1c3eb8bf3b0501\nbob,0x38e47a7b719dce63662aeaf43440326f551b8a7ee198cee35cb5d517f2d296a2"; + await writeFile(inputFile, csvContent); + + // Should process the file (header will be treated as a regular row and fail validation) + // Actually, the header row will be processed and fail because "label" is not a valid hex hash + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-header" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + silent: true, + }), + ).rejects.toThrow(/Invalid labelHash/); + + // For a proper test, let's create a CSV where the header is valid data + const csvContentValid = "label\nlabel1\nlabel2"; + await writeFile(inputFile, csvContentValid); + + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-header-valid" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + silent: true, + }), + ).resolves.not.toThrow(); + + // Verify records were created (including "label" as a label) + const dataDir = join(tempDir, "db_header"); + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + // Should have 3 records: "label", "label1", "label2" + expect(recordsCount).toBe(3); + await db.close(); + }); + + it("should handle CSV with malformed rows (extra columns)", async () => { + const inputFile = join(tempDir, "malformed_extra_cols.csv"); + const outputFile = join(tempDir, "output_malformed.ensrainbow"); + const csvContent = + "alice\nbob,0x38e47a7b719dce63662aeaf43440326f551b8a7ee198cee35cb5d517f2d296a2,extra\ncharlie"; + await writeFile(inputFile, csvContent); + + // Should fail when column count is inconsistent + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-malformed" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + silent: true, + }), + ).rejects.toThrow(/Expected \d+ columns/); + }); + + it("should handle CSV with malformed rows (missing columns)", async () => { + const inputFile = join(tempDir, "malformed_missing_cols.csv"); + const outputFile = join(tempDir, 
"output_malformed2.ensrainbow"); + const csvContent = + "alice,0x9c0257114eb9399a2985f8e75dad7600c5d89fe3824ffa99ec1c3eb8bf3b0501\nbob\ncharlie,0x87a213ce1ee769e28decedefb98f6fe48890a74ba84957ebf877fb591e37e0de"; + await writeFile(inputFile, csvContent); + + // Should fail when column count is inconsistent + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-malformed2" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + silent: true, + }), + ).rejects.toThrow(/Expected \d+ columns/); + }); + + it("should handle CSV with quoted fields containing commas", async () => { + const inputFile = join(tempDir, "quoted_fields.csv"); + const outputFile = join(tempDir, "output_quoted.ensrainbow"); + // CSV with quoted fields that contain commas - use single column format to auto-compute hashes + const csvContent = '"label,with,commas"\n"another,label"'; + await writeFile(inputFile, csvContent); + + // Should handle quoted fields correctly + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-quoted" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + silent: true, + }), + ).resolves.not.toThrow(); + + // Verify the output file was created + const stats = await stat(outputFile); + expect(stats.isFile()).toBe(true); + expect(stats.size).toBeGreaterThan(0); + + // Ingest and verify records + const dataDir = join(tempDir, "db_quoted"); + const cli = createCLI({ exitProcess: false }); + await cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", dataDir]); + + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + const recordsCount = await db.getPrecalculatedRainbowRecordCount(); + expect(recordsCount).toBe(2); + + // Verify the labels were stored correctly + const label1 = "label,with,commas"; + const label2 = "another,label"; + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash(label1))))?.label).toBe( + label1, + ); + expect((await db.getVersionedRainbowRecord(labelHashToBytes(labelhash(label2))))?.label).toBe( + label2, + ); + await db.close(); + }); + + it("should handle CSV with empty labelhash column (should fail validation)", async () => { + const inputFile = join(tempDir, "empty_hash.csv"); + const outputFile = join(tempDir, "output_empty_hash.ensrainbow"); + const csvContent = + "alice,0x9c0257114eb9399a2985f8e75dad7600c5d89fe3824ffa99ec1c3eb8bf3b0501\nbob,\ncharlie,0x87a213ce1ee769e28decedefb98f6fe48890a74ba84957ebf877fb591e37e0de"; + await writeFile(inputFile, csvContent); + + // Should fail when labelhash is empty + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-empty-hash" as LabelSetId, + labelSetVersion: 0 as LabelSetVersion, + silent: true, + }), + ).rejects.toThrow(/LabelHash cannot be empty/); + }); + }); }); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 4770a0b1e..87995971e 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -63,7 +63,7 @@ class DeduplicationDB { this.pendingWrites.set(key, value); // Flush frequently to keep pendingWrites small - if (this.pendingWrites.size >= 1000) { + if (this.pendingWrites.size >= DEDUP_PENDING_WRITES_FLUSH_THRESHOLD) { await this.flush(); } } @@ -112,7 +112,7 @@ function setupProgressBar(): ProgressBar { complete: "=", incomplete: " ", width: 40, - total: 300000000, // Very large total for big files + total: 
PROGRESS_BAR_LARGE_TOTAL, }); } @@ -131,6 +131,11 @@ export interface ConvertCsvCommandOptions { // Configuration constants const DEFAULT_PROGRESS_INTERVAL = 50000; // Increased from 10k to 50k to reduce logging load +const PROGRESS_BAR_LARGE_TOTAL = 300_000_000; // Very large total for progress bar to handle big files +const DEDUP_PENDING_WRITES_FLUSH_THRESHOLD = 1000; // Flush deduplication DB when pending writes reach this count +const OUTPUT_STREAM_BUFFER_SIZE = 16 * 1024; // 16KB buffer - very small to catch backpressure early +const LARGE_FILE_SIZE_THRESHOLD_MB = 1024; // 1GB - warn user about very large files +const PROGRESS_BAR_UPDATE_INTERVAL = 1000; // Update progress bar every N lines interface ConversionStats { totalLines: number; @@ -150,7 +155,7 @@ function setupWriteStream(outputFile: string) { // This prevents unbounded buffer growth when writes are faster than disk I/O // Smaller buffer = more frequent backpressure = better memory control return createWriteStream(outputFile, { - highWaterMark: 16 * 1024, // 16KB buffer - very small to catch backpressure early + highWaterMark: OUTPUT_STREAM_BUFFER_SIZE, }); } @@ -222,8 +227,7 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { const fileSizeMB = (stats.size / (1024 * 1024)).toFixed(2); logger.info(`Input file size: ${fileSizeMB} MB`); - if (stats.size > 1024 * 1024 * 1024) { - // > 1GB + if (stats.size > LARGE_FILE_SIZE_THRESHOLD_MB * 1024 * 1024) { logger.warn("⚠️ Processing a very large file - using SEQUENTIAL mode."); } } catch (error) { @@ -451,8 +455,8 @@ async function processCSVFile( } // Update progress bar - if (lineNumber % 1000 === 0 && progressBar) { - progressBar.tick(1000); + if (lineNumber % PROGRESS_BAR_UPDATE_INTERVAL === 0 && progressBar) { + progressBar.tick(PROGRESS_BAR_UPDATE_INTERVAL); progressBar.curr = lineNumber; } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b38eaba1b..2d4600d82 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -462,9 +462,6 @@ importers: '@hono/node-server': specifier: ^1.4.1 version: 1.19.5(hono@4.10.3) - bloom-filters: - specifier: ^3.0.4 - version: 3.0.4 classic-level: specifier: ^1.4.1 version: 1.4.1 @@ -3438,9 +3435,6 @@ packages: '@types/sax@1.2.7': resolution: {integrity: sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==} - '@types/seedrandom@3.0.8': - resolution: {integrity: sha512-TY1eezMU2zH2ozQoAFAQFOPpvP15g+ZgSfTZt31AUUH/Rxtnz3H+A/Sv1Snw2/amp//omibc+AEkTaA8KUeOLQ==} - '@types/tar@6.1.13': resolution: {integrity: sha512-IznnlmU5f4WcGTh2ltRu/Ijpmk8wiWXfF0VA4s+HPjHZgvFggk1YaIkbo5krX/zUCzWF8N/l4+W/LNxnvAJ8nw==} @@ -3789,10 +3783,6 @@ packages: base-64@1.0.0: resolution: {integrity: sha512-kwDPIFCGx0NZHog36dj+tHiwP4QMzsZ3AgMViUBKI0+V5n4U0ufTCUMhnQ04diaRI8EX/QcPfql7zlhZ7j4zgg==} - base64-arraybuffer@1.0.2: - resolution: {integrity: sha512-I3yl4r9QB5ZRY3XuJVEPfc2XhZO6YweFPI+UovAzn+8/hb3oJ6lnysaFcjVpkCPfVWFUDvoZ8kmVDP7WyRtYtQ==} - engines: {node: '>= 0.6.0'} - base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} @@ -3823,10 +3813,6 @@ packages: bintrees@1.0.2: resolution: {integrity: sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==} - bloom-filters@3.0.4: - resolution: {integrity: sha512-BdnPWo2OpYhlvuP2fRzJBdioMCkm7Zp0HCf8NJgF5Mbyqy7VQ/CnTiVWMMyq4EZCBHwj0Kq6098gW2/3RsZsrA==} - engines: {node: '>=12'} - boolbase@1.0.0: resolution: {integrity: 
sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} @@ -4181,9 +4167,6 @@ packages: csstype@3.2.3: resolution: {integrity: sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==} - cuint@0.2.2: - resolution: {integrity: sha512-d4ZVpCW31eWwCMe1YT3ur7mUDnTXbgwyzaL320DrcRT45rfjYxkt5QWLrmOJ+/UEAI2+fQgKe/fCjR8l4TpRgw==} - cytoscape-cose-bilkent@4.1.0: resolution: {integrity: sha512-wgQlVIUJF13Quxiv5e1gstZ08rnZj2XaLHGoFMYXz7SkNfCDOOteKBE6SYRfA9WxxI/iBc3ajfDoc6hb/MRAHQ==} peerDependencies: @@ -6554,9 +6537,6 @@ packages: recma-stringify@1.0.0: resolution: {integrity: sha512-cjwII1MdIIVloKvC9ErQ+OgAtwHBmcZ0Bg4ciz78FtbT8In39aAYbaA7zvxQ61xVMSPE8WxhLwLbhif4Js2C+g==} - reflect-metadata@0.1.14: - resolution: {integrity: sha512-ZhYeb6nRaXCfhnndflDK8qI6ZQ/YcWZCISRAWICW9XYqMUwjZM9Z0DveWX/ABN01oxSHwVxKQmxeYZSsm0jh5A==} - regex-recursion@6.0.2: resolution: {integrity: sha512-0YCaSCq2VRIebiaUviZNs0cBz1kg5kVS2UKUfNIx8YVs1cN3AV7NTctO5FOKBA+UT2BPJIWZauYHPqJODG50cg==} @@ -6728,9 +6708,6 @@ packages: secure-json-parse@4.1.0: resolution: {integrity: sha512-l4KnYfEyqYJxDwlNVyRfO2E4NTHfMKAWdUuA8J0yve2Dz/E/PdBepY03RvyJpssIpRFwJoCD55wA+mEDs6ByWA==} - seedrandom@3.0.5: - resolution: {integrity: sha512-8OwmbklUNzwezjGInmZ+2clQmExQPvomqjL7LFqOYqtmuxRgQYqOD3mHaU+MvZn5FLUeVxVfQjwLZW/n/JFuqg==} - semver-compare@1.0.0: resolution: {integrity: sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==} @@ -7790,9 +7767,6 @@ packages: xxhash-wasm@1.1.0: resolution: {integrity: sha512-147y/6YNh+tlp6nd/2pWq38i9h6mz/EuQ6njIrmW8D1BS5nCqs0P6DG+m6zTGnNz5I+uhZ0SHxBs9BsPrwcKDA==} - xxhashjs@0.2.2: - resolution: {integrity: sha512-AkTuIuVTET12tpsVIQo+ZU6f/qDmKuRUcjaqR+OIvm+aCBsZ95i7UVY5WJ9TMsSaZ0DA2WxoZ4acu0sPH+OKAw==} - y18n@5.0.8: resolution: {integrity: sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==} engines: {node: '>=10'} @@ -10844,8 +10818,6 @@ snapshots: dependencies: '@types/node': 22.18.13 - '@types/seedrandom@3.0.8': {} - '@types/tar@6.1.13': dependencies: '@types/node': 22.18.13 @@ -11322,8 +11294,6 @@ snapshots: base-64@1.0.0: {} - base64-arraybuffer@1.0.2: {} - base64-js@1.5.1: {} baseline-browser-mapping@2.8.21: {} @@ -11350,17 +11320,6 @@ snapshots: bintrees@1.0.2: {} - bloom-filters@3.0.4: - dependencies: - '@types/seedrandom': 3.0.8 - base64-arraybuffer: 1.0.2 - is-buffer: 2.0.5 - lodash: 4.17.21 - long: 5.3.2 - reflect-metadata: 0.1.14 - seedrandom: 3.0.5 - xxhashjs: 0.2.2 - boolbase@1.0.0: {} boring-avatars@1.11.2: {} @@ -11736,8 +11695,6 @@ snapshots: csstype@3.2.3: {} - cuint@0.2.2: {} - cytoscape-cose-bilkent@4.1.0(cytoscape@3.33.1): dependencies: cose-base: 1.0.3 @@ -14485,8 +14442,6 @@ snapshots: unified: 11.0.5 vfile: 6.0.3 - reflect-metadata@0.1.14: {} - regex-recursion@6.0.2: dependencies: regex-utilities: 2.3.0 @@ -14753,8 +14708,6 @@ snapshots: secure-json-parse@4.1.0: {} - seedrandom@3.0.5: {} - semver-compare@1.0.0: {} semver@6.3.1: {} @@ -15861,10 +15814,6 @@ snapshots: xxhash-wasm@1.1.0: {} - xxhashjs@0.2.2: - dependencies: - cuint: 0.2.2 - y18n@5.0.8: {} yallist@3.1.1: {} From 35a05cb08d576053f1b4192beac4b2597bc8f30c Mon Sep 17 00:00:00 2001 From: "kwrobel.eth" Date: Mon, 5 Jan 2026 14:19:28 +0100 Subject: [PATCH 15/30] Apply suggestions from code review Co-authored-by: lightwalker.eth <126201998+lightwalker-eth@users.noreply.github.com> --- .changeset/brave-kiwis-notice.md | 2 +- apps/ensrainbow/src/cli.ts | 8 ++++---- 
.../ensrainbow/concepts/creating-files.mdx | 19 ++++++++----------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/.changeset/brave-kiwis-notice.md b/.changeset/brave-kiwis-notice.md index fbdba8bfc..a514e5684 100644 --- a/.changeset/brave-kiwis-notice.md +++ b/.changeset/brave-kiwis-notice.md @@ -2,4 +2,4 @@ "ensrainbow": patch --- -feat: add CSV conversion command to ensrainbow CLI +feat: add CSV conversion command to ensrainbow CLI to convert rainbow tables from CSV format to ensrainbow format diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index de84a0963..b010bed15 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -242,18 +242,18 @@ export function createCLI(options: CLIOptions = {}) { }) .option("output-file", { type: "string", - description: "Path to the output ensrainbow file", + description: "Path to where the resulting ensrainbow file will be output", default: join(process.cwd(), "rainbow-records.ensrainbow"), }) .option("label-set-id", { type: "string", - description: "Label set id for the rainbow record collection", + description: "Label set id for the generated ensrainbow file", demandOption: true, }) .coerce("label-set-id", buildLabelSetId) .option("label-set-version", { type: "number", - description: "Label set version for the rainbow record collection", + description: "Label set version for the generated ensrainbow file", demandOption: true, }) .coerce("label-set-version", buildLabelSetVersion) @@ -264,7 +264,7 @@ export function createCLI(options: CLIOptions = {}) { }) .option("existing-db-path", { type: "string", - description: "Path to existing ENSRainbow database to filter out existing labels", + description: "Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file", }) .option("silent", { type: "boolean", diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 2d9ec8c10..d914a344d 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -1,10 +1,10 @@ --- title: Creating ENSRainbow Files -description: Complete guide to creating .ensrainbow files from SQL dumps and CSV data. +description: Complete guide to creating .ensrainbow files. sidebar: label: Creating Files order: 3 -keywords: [ensrainbow, file creation, conversion, sql, csv] +keywords: [ensrainbow, file creation, conversion, csv] --- ENSRainbow provides two methods for creating `.ensrainbow` files from different data sources. This guide helps you choose the right method and provides step-by-step instructions. @@ -42,20 +42,17 @@ For detailed information about the file format structure, see the [Data Model](/ | Method | Input Format | Use Case | Command | |--------|-------------|----------|---------| | **SQL Conversion** | Gzipped SQL dump (`ens_names.sql.gz`) | Converting legacy ENS Subgraph data | `pnpm run convert` | -| **CSV Conversion** | CSV file (1 or 2 columns) | Custom datasets, test data, external sources | `pnpm run convert-csv` | +| **CSV Conversion** | CSV file (1 or 2 columns) | Building new ENS rainbow tables | `pnpm run convert-csv` | ### When to Use SQL Conversion - Converting existing ENS Subgraph rainbow tables -- Working with legacy `ens_names.sql.gz` files +- Working with the legacy `ens_names.sql.gz` file. 
- Migrating from previous ENS data formats ### When to Use CSV Conversion -- Creating test datasets -- Converting data from external sources -- Working with custom label collections -- Building incremental label sets +- Creating new rainbow tables for ENSRainbow ## Method 1: Converting from SQL Dumps @@ -275,7 +272,7 @@ pnpm run ingest-ensrainbow \ pnpm run serve --data-dir data-test-env --port 3223 ``` -### Workflow 3: Building Custom Dataset +### Workflow 3: Create a new Labelset ```bash # 1. Create CSV with your labels @@ -298,10 +295,10 @@ pnpm run ingest-ensrainbow \ pnpm run serve --data-dir data-custom --port 3223 ``` -### Workflow 4: Creating Incremental Updates +### Workflow 4: Creating Incremental Label Set Versions ```bash -# 1. Create initial dataset +# 1. Create initial labelset pnpm run convert-csv \ --input-file initial-labels.csv \ --output-file my-dataset_0.ensrainbow \ From 2cc8cad606029d1ccd3e7d30225b9a94a70c48f3 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 5 Jan 2026 15:31:23 +0100 Subject: [PATCH 16/30] refactor: rename convert command to convert-sql and update CLI documentation --- apps/ensrainbow/package.json | 2 +- apps/ensrainbow/src/cli.test.ts | 22 +-- apps/ensrainbow/src/cli.ts | 73 ++++---- .../src/commands/convert-csv-command.test.ts | 6 +- .../ensrainbow/concepts/creating-files.mdx | 162 +++++++++--------- .../docs/ensrainbow/concepts/data-model.mdx | 4 +- .../ensrainbow/contributing/cli-reference.mdx | 27 ++- .../docs/ensrainbow/contributing/index.mdx | 12 +- .../src/content/docs/ensrainbow/faq.mdx | 21 ++- 9 files changed, 177 insertions(+), 152 deletions(-) diff --git a/apps/ensrainbow/package.json b/apps/ensrainbow/package.json index 024d6f567..704a88cf7 100644 --- a/apps/ensrainbow/package.json +++ b/apps/ensrainbow/package.json @@ -19,7 +19,7 @@ "validate:lite": "tsx src/cli.ts validate --lite", "purge": "tsx src/cli.ts purge", "convert": "tsx src/cli.ts convert", - "convert-csv": "tsx src/cli.ts convert-csv", + "convert-sql": "tsx src/cli.ts convert-sql", "test": "vitest", "test:coverage": "vitest --coverage", "lint": "biome check --write .", diff --git a/apps/ensrainbow/src/cli.test.ts b/apps/ensrainbow/src/cli.test.ts index ff9364a32..596b35663 100644 --- a/apps/ensrainbow/src/cli.test.ts +++ b/apps/ensrainbow/src/cli.test.ts @@ -111,7 +111,7 @@ describe("CLI", () => { expect(() => cli.parse([ - "convert", + "convert-sql", "--input-file", sqlInputFile, "--output-file", @@ -122,7 +122,7 @@ describe("CLI", () => { // Successful convert with args const ingestCli = createCLI({ exitProcess: false }); await ingestCli.parse([ - "convert", + "convert-sql", "--input-file", sqlInputFile, "--output-file", @@ -132,7 +132,7 @@ describe("CLI", () => { "--label-set-version", labelSetVersion.toString(), ]); - //command: pnpm convert --input-file test/fixtures/test_ens_names.sql.gz --output-file test/fixtures/test_ens_names_0.ensrainbow --label-set-id test-ens-names --label-set-version 0 + //command: pnpm convert-sql --input-file test/fixtures/test_ens_names.sql.gz --output-file test/fixtures/test_ens_names_0.ensrainbow --label-set-id test-ens-names --label-set-version 0 //verify that the file is created await expect(stat(ensrainbowOutputFile)).resolves.toBeDefined(); @@ -163,7 +163,7 @@ describe("CLI", () => { expect(() => cli.parse([ - "convert", + "convert-sql", "--input-file", sqlInputFile, "--output-file", @@ -174,7 +174,7 @@ describe("CLI", () => { // Successful convert with args const ingestCli = createCLI({ exitProcess: false }); await ingestCli.parse([ 
- "convert", + "convert-sql", "--input-file", sqlInputFile, "--output-file", @@ -184,7 +184,7 @@ describe("CLI", () => { "--label-set-version", labelSetVersion.toString(), ]); - //command: pnpm convert --input-file test_ens_names.sql.gz --output-file test_ens_names_0.ensrainbow --label-set-id test-ens-names --label-set-version 0 + //command: pnpm convert-sql --input-file test_ens_names.sql.gz --output-file test_ens_names_0.ensrainbow --label-set-id test-ens-names --label-set-version 0 //verify that the file is created await expect(stat(ensrainbowOutputFile)).resolves.toBeDefined(); @@ -211,7 +211,7 @@ describe("CLI", () => { expect(() => cli.parse([ - "convert", + "convert-sql", "--input-file", sqlInputFile, "--output-file", @@ -222,7 +222,7 @@ describe("CLI", () => { const ingestCli2 = createCLI({ exitProcess: false }); // Successful convert with args await ingestCli2.parse([ - "convert", + "convert-sql", "--input-file", sqlInputFile, "--output-file", @@ -266,7 +266,7 @@ describe("CLI", () => { // Successful convert with label set version 2 const convertCli = createCLI({ exitProcess: false }); await convertCli.parse([ - "convert", + "convert-sql", "--input-file", sqlInputFile, "--output-file", @@ -318,7 +318,7 @@ describe("CLI", () => { // Create second file with different label set id and label set version 0 const convertCli = createCLI({ exitProcess: false }); await convertCli.parse([ - "convert", + "convert-sql", "--input-file", sqlInputFile, "--output-file", @@ -331,7 +331,7 @@ describe("CLI", () => { // Create third file with different label set id and label set version 1 await convertCli.parse([ - "convert", + "convert-sql", "--input-file", sqlInputFile, "--output-file", diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index b010bed15..75dc53587 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -195,92 +195,93 @@ export function createCLI(options: CLIOptions = {}) { ) .command( "convert", - "Convert rainbow tables from SQL dump to ensrainbow format", + "Convert rainbow tables from CSV format to ensrainbow format", (yargs: Argv) => { return yargs .option("input-file", { type: "string", - description: "Path to the gzipped SQL dump file", - default: join(process.cwd(), "ens_names.sql.gz"), + description: "Path to the CSV input file", + demandOption: true, }) .option("output-file", { type: "string", - description: "Path to the output ensrainbow file", + description: "Path to where the resulting ensrainbow file will be output", default: join(process.cwd(), "rainbow-records.ensrainbow"), }) .option("label-set-id", { type: "string", - description: "Label set id for the rainbow record collection", + description: "Label set id for the generated ensrainbow file", demandOption: true, }) .coerce("label-set-id", buildLabelSetId) .option("label-set-version", { type: "number", - description: "Label set version for the rainbow record collection", + description: "Label set version for the generated ensrainbow file", demandOption: true, }) - .coerce("label-set-version", buildLabelSetVersion); + .coerce("label-set-version", buildLabelSetVersion) + .option("progress-interval", { + type: "number", + description: "Number of records to process before logging progress", + default: 50000, + }) + .option("existing-db-path", { + type: "string", + description: + "Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file", + }) + .option("silent", { + type: "boolean", + description: "Disable progress bar (useful for 
scripts)", + default: false, + }); }, - async (argv: ArgumentsCamelCase) => { - await convertCommand({ + async (argv: ArgumentsCamelCase) => { + await convertCsvCommand({ inputFile: argv["input-file"], outputFile: argv["output-file"], labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], + progressInterval: argv["progress-interval"], + existingDbPath: argv["existing-db-path"], + silent: argv["silent"], }); }, ) .command( - "convert-csv", - "Convert rainbow tables from CSV format to ensrainbow format", + "convert-sql", + "Convert rainbow tables from legacy SQL dump to ensrainbow format", (yargs: Argv) => { return yargs .option("input-file", { type: "string", - description: "Path to the CSV input file", - demandOption: true, + description: "Path to the gzipped SQL dump file", + default: join(process.cwd(), "ens_names.sql.gz"), }) .option("output-file", { type: "string", - description: "Path to where the resulting ensrainbow file will be output", + description: "Path to the output ensrainbow file", default: join(process.cwd(), "rainbow-records.ensrainbow"), }) .option("label-set-id", { type: "string", - description: "Label set id for the generated ensrainbow file", + description: "Label set id for the rainbow record collection", demandOption: true, }) .coerce("label-set-id", buildLabelSetId) .option("label-set-version", { type: "number", - description: "Label set version for the generated ensrainbow file", + description: "Label set version for the rainbow record collection", demandOption: true, }) - .coerce("label-set-version", buildLabelSetVersion) - .option("progress-interval", { - type: "number", - description: "Number of records to process before logging progress", - default: 50000, - }) - .option("existing-db-path", { - type: "string", - description: "Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file", - }) - .option("silent", { - type: "boolean", - description: "Disable progress bar (useful for scripts)", - default: false, - }); + .coerce("label-set-version", buildLabelSetVersion); }, - async (argv: ArgumentsCamelCase) => { - await convertCsvCommand({ + async (argv: ArgumentsCamelCase) => { + await convertCommand({ inputFile: argv["input-file"], outputFile: argv["output-file"], labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], - progressInterval: argv["progress-interval"], - existingDbPath: argv["existing-db-path"], - silent: argv["silent"], }); }, ) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index e45c8712c..685ff6da7 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -197,7 +197,7 @@ describe("convert-csv-command", () => { // Test convert-csv command through CLI await cli.parse([ - "convert-csv", + "convert", "--input-file", inputFile, "--output-file", @@ -373,7 +373,7 @@ describe("convert-csv-command", () => { const cli = createCLI({ exitProcess: false }); await cli.parse([ - "convert-csv", + "convert", "--input-file", inputFile, "--output-file", @@ -394,7 +394,7 @@ describe("convert-csv-command", () => { // Now test CLI with existing database path await cli.parse([ - "convert-csv", + "convert", "--input-file", inputFile, "--output-file", diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 
d914a344d..cb1ac514e 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -41,81 +41,29 @@ For detailed information about the file format structure, see the [Data Model](/ | Method | Input Format | Use Case | Command | |--------|-------------|----------|---------| -| **SQL Conversion** | Gzipped SQL dump (`ens_names.sql.gz`) | Converting legacy ENS Subgraph data | `pnpm run convert` | -| **CSV Conversion** | CSV file (1 or 2 columns) | Building new ENS rainbow tables | `pnpm run convert-csv` | - -### When to Use SQL Conversion - -- Converting existing ENS Subgraph rainbow tables -- Working with the legacy `ens_names.sql.gz` file. -- Migrating from previous ENS data formats +| **CSV Conversion** | CSV file (1 or 2 columns) | Building new ENS rainbow tables | `pnpm run convert` | +| **SQL Conversion** | Gzipped SQL dump (`ens_names.sql.gz`) | Converting legacy ENS Subgraph data | `pnpm run convert-sql` | ### When to Use CSV Conversion - Creating new rainbow tables for ENSRainbow +- Building custom label sets +- Standard data ingestion workflow -## Method 1: Converting from SQL Dumps - -The `convert` command processes gzipped SQL dump files from the ENS Subgraph. - -### Command Syntax - -```bash -pnpm run convert \ - --input-file \ - --output-file \ - --label-set-id \ - --label-set-version -``` - -### Required Parameters - -- `--input-file`: Path to the gzipped SQL dump file -- `--label-set-id`: Identifier for the label set (e.g., `subgraph`, `discovery-a`) -- `--label-set-version`: Version number for the label set (non-negative integer) - -### Optional Parameters - -- `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) - -### Example: Converting ENS Subgraph Data - -```bash -# Convert main ENS Subgraph data -pnpm run convert \ - --input-file ens_names.sql.gz \ - --output-file subgraph_0.ensrainbow \ - --label-set-id subgraph \ - --label-set-version 0 -``` - -### Example: Converting Test Data - -```bash -# Convert ens-test-env data -pnpm run convert \ - --input-file test/fixtures/ens_test_env_names.sql.gz \ - --output-file ens-test-env_0.ensrainbow \ - --label-set-id ens-test-env \ - --label-set-version 0 -``` - -### How It Works +### When to Use SQL Conversion -1. **Streams** the gzipped SQL file to avoid memory issues -2. **Parses** SQL COPY statements to extract label/labelhash pairs -3. **Validates** each record and skips invalid entries -4. **Writes** protobuf messages with length-delimited encoding -5. **Creates** a header message followed by individual record messages +- Converting existing ENS Subgraph rainbow tables +- Working with the legacy `ens_names.sql.gz` file +- Migrating from previous ENS data formats -## Method 2: Converting from CSV Files +## Method 1: Converting from CSV Files -The `convert-csv` command processes CSV files with flexible column formats. +The `convert` command processes CSV files with flexible column formats. 
### Command Syntax ```bash -pnpm run convert-csv \ +pnpm run convert \ --input-file \ --output-file \ --label-set-id \ @@ -168,7 +116,7 @@ The CSV converter includes built-in filtering capabilities to prevent duplicate Use `--existing-db-path` to filter out labels that already exist in an existing ENSRainbow database: ```bash -pnpm run convert-csv \ +pnpm run convert \ --input-file new-labels.csv \ --output-file incremental_1.ensrainbow \ --label-set-id my-dataset \ @@ -200,7 +148,7 @@ Duration: 150ms ```bash # Create test dataset from CSV -pnpm run convert-csv \ +pnpm run convert \ --input-file test-labels.csv \ --output-file test-dataset_0.ensrainbow \ --label-set-id test-dataset \ @@ -212,7 +160,7 @@ pnpm run convert-csv \ ```bash # Create discovery dataset (initially empty) echo "" > empty.csv -pnpm run convert-csv \ +pnpm run convert \ --input-file empty.csv \ --output-file discovery-a_0.ensrainbow \ --label-set-id discovery-a \ @@ -227,7 +175,61 @@ pnpm run convert-csv \ 4. **Computes** or validates labelhashes as needed 5. **Filters** existing labels if `--existing-db-path` is provided 6. **Filters** duplicate labels within the same CSV file -7. **Writes** protobuf messages with the same format as SQL conversion +7. **Writes** protobuf messages with length-delimited encoding + +## Method 2: Converting from SQL Dumps + +The `convert-sql` command processes gzipped SQL dump files from the ENS Subgraph. + +### Command Syntax + +```bash +pnpm run convert-sql \ + --input-file \ + --output-file \ + --label-set-id \ + --label-set-version +``` + +### Required Parameters + +- `--input-file`: Path to the gzipped SQL dump file +- `--label-set-id`: Identifier for the label set (e.g., `subgraph`, `discovery-a`) +- `--label-set-version`: Version number for the label set (non-negative integer) + +### Optional Parameters + +- `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) + +### Example: Converting ENS Subgraph Data + +```bash +# Convert main ENS Subgraph data +pnpm run convert-sql \ + --input-file ens_names.sql.gz \ + --output-file subgraph_0.ensrainbow \ + --label-set-id subgraph \ + --label-set-version 0 +``` + +### Example: Converting Test Data + +```bash +# Convert ens-test-env data +pnpm run convert-sql \ + --input-file test/fixtures/ens_test_env_names.sql.gz \ + --output-file ens-test-env_0.ensrainbow \ + --label-set-id ens-test-env \ + --label-set-version 0 +``` + +### How It Works + +1. **Streams** the gzipped SQL file to avoid memory issues +2. **Parses** SQL COPY statements to extract label/labelhash pairs +3. **Validates** each record and skips invalid entries +4. **Writes** protobuf messages with length-delimited encoding +5. **Creates** a header message followed by individual record messages ## Common Workflows @@ -235,7 +237,7 @@ pnpm run convert-csv \ ```bash # 1. Convert SQL dump to .ensrainbow -pnpm run convert \ +pnpm run convert-sql \ --input-file ens_names.sql.gz \ --output-file subgraph_0.ensrainbow \ --label-set-id subgraph \ @@ -257,7 +259,7 @@ pnpm run serve --data-dir data-subgraph --port 3223 ```bash # 1. Convert test data -pnpm run convert \ +pnpm run convert-sql \ --input-file test/fixtures/ens_test_env_names.sql.gz \ --output-file ens-test-env_0.ensrainbow \ --label-set-id ens-test-env \ @@ -281,7 +283,7 @@ mylabel2 mylabel3" > custom-labels.csv # 2. 
Convert to .ensrainbow -pnpm run convert-csv \ +pnpm run convert \ --input-file custom-labels.csv \ --output-file custom_0.ensrainbow \ --label-set-id custom \ @@ -299,7 +301,7 @@ pnpm run serve --data-dir data-custom --port 3223 ```bash # 1. Create initial labelset -pnpm run convert-csv \ +pnpm run convert \ --input-file initial-labels.csv \ --output-file my-dataset_0.ensrainbow \ --label-set-id my-dataset \ @@ -311,7 +313,7 @@ pnpm run ingest-ensrainbow \ --data-dir data-my-dataset # 3. Create incremental update (filtering existing labels) -pnpm run convert-csv \ +pnpm run convert \ --input-file new-labels.csv \ --output-file my-dataset_1.ensrainbow \ --label-set-id my-dataset \ @@ -379,26 +381,26 @@ If you want to create, publish, and distribute your own `.ensrainbow` files, fol ### 1. Create Your Dataset -First, prepare your data in either SQL or CSV (recommended) format, then convert it using the appropriate method: +First, prepare your data in either CSV (recommended) or SQL format, then convert it using the appropriate method: ```bash -# For CSV data -pnpm run convert-csv \ +# For CSV data (recommended) +pnpm run convert \ --input-file my-labels.csv \ --output-file my-dataset_0.ensrainbow \ --label-set-id my-dataset \ --label-set-version 0 # For CSV data with filtering (if you have an existing database) -pnpm run convert-csv \ +pnpm run convert \ --input-file my-labels.csv \ --output-file my-dataset_1.ensrainbow \ --label-set-id my-dataset \ --label-set-version 1 \ --existing-db-path data-my-dataset -# For SQL data -pnpm run convert \ +# For legacy SQL data +pnpm run convert-sql \ --input-file my-data.sql.gz \ --output-file my-dataset_0.ensrainbow \ --label-set-id my-dataset \ @@ -614,7 +616,7 @@ LABEL_SET_ID="my-dataset" NEW_VERSION="1" # Create new .ensrainbow file -pnpm run convert-csv \ +pnpm run convert \ --input-file updated-labels.csv \ --output-file ${LABEL_SET_ID}_${NEW_VERSION}.ensrainbow \ --label-set-id ${LABEL_SET_ID} \ @@ -652,7 +654,7 @@ export ENSRAINBOW_LABELSET_SERVER_URL="https://my-label-set-server.com" # Test downloading prebuilt database ./scripts/download-prebuilt-database.sh 3 my-dataset 0 -# Verify the database works +# Verify the database works by ingesting the downloaded file pnpm run ingest-ensrainbow \ --input-file labelsets/my-dataset_0.ensrainbow \ --data-dir test-data diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx index e1df686d0..64189a07c 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx @@ -108,8 +108,8 @@ subgraph_1.ensrainbow # next version with incremental labelhash-to-label map ENSRainbow provides two methods for creating `.ensrainbow` files from different data sources: -- **SQL Conversion**: Convert legacy ENS Subgraph data (`ens_names.sql.gz`) using `pnpm run convert` -- **CSV Conversion**: Convert custom datasets from CSV files using `pnpm run convert-csv` +- **CSV Conversion**: Convert custom datasets from CSV files using `pnpm run convert` +- **SQL Conversion**: Convert legacy ENS Subgraph data (`ens_names.sql.gz`) using `pnpm run convert-sql` For complete instructions, examples, and workflow guidance, see the [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) guide. 
diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/cli-reference.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/cli-reference.mdx index 6326f7b8b..31b8c686a 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/cli-reference.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/cli-reference.mdx @@ -8,26 +8,39 @@ keywords: [ensrainbow, cli] | Command | Purpose | Most useful flags | Example | |---------|---------|-------------------|---------| -| `convert` | Convert legacy `.sql.gz` rainbow tables to `.ensrainbow` format. **This is currently the only way to create new .ensrainbow files.** | `--input-file`, `--output-file`, `--label-set-id`, `--label-set-version` | `pnpm run convert --input-file ens_names.sql.gz --output-file subgraph-0.ensrainbow` | -| `ingest-ensrainbow` | Stream a `.ensrainbow` file into LevelDB | `--input-file`, `--data-dir` | `pnpm run ingest-ensrainbow --input-file subgraph-0.ensrainbow --data-dir ./data` | +| `convert` | Convert CSV files to `.ensrainbow` format. **This is the primary method for creating new .ensrainbow files.** | `--input-file`, `--output-file`, `--label-set-id`, `--label-set-version`, `--existing-db-path`, `--silent` | `pnpm run convert --input-file labels.csv --output-file my-dataset_0.ensrainbow --label-set-id my-dataset --label-set-version 0` | +| `convert-sql` | Convert legacy `.sql.gz` rainbow tables (ENS Subgraph data) to `.ensrainbow` format | `--input-file`, `--output-file`, `--label-set-id`, `--label-set-version` | `pnpm run convert-sql --input-file ens_names.sql.gz --output-file subgraph_0.ensrainbow --label-set-id subgraph --label-set-version 0` | +| `ingest-ensrainbow` | Stream a `.ensrainbow` file into LevelDB | `--input-file`, `--data-dir` | `pnpm run ingest-ensrainbow --input-file my-dataset_0.ensrainbow --data-dir ./data` | | `validate` | Verify DB integrity | `--data-dir`, `--lite` | `pnpm run validate --lite` | | `purge` | Delete all DB files in a directory | `--data-dir` | `pnpm run purge --data-dir ./data` | | `serve` | Launch the HTTP API server | `--data-dir`, `--port` | `pnpm run serve --port 3223` | ## Creating .ensrainbow Files -:::note[Important] -The `convert` command is **the only way** to create new `.ensrainbow` files from scratch. If you need to create custom label sets with your own data, you must use this command to convert from PostgreSQL dump format. +### CSV Conversion (Recommended) -You can download existing `.ensrainbow` files using the download scripts, but for creating entirely new files, `convert` is your only option. -::: +The `convert` command is the **primary method** for creating new `.ensrainbow` files from CSV data. 
**Full convert command syntax:** ```bash pnpm run convert \ - --input-file path/to/your_data.sql.gz \ + --input-file path/to/labels.csv \ --output-file path/to/output.ensrainbow \ --label-set-id your-label-set-id \ + --label-set-version 0 \ + [--existing-db-path path/to/existing/database] \ + [--silent] +``` + +### SQL Conversion (Legacy) + +For converting legacy ENS Subgraph data from SQL dumps: + +```bash +pnpm run convert-sql \ + --input-file path/to/ens_names.sql.gz \ + --output-file path/to/output.ensrainbow \ + --label-set-id subgraph \ --label-set-version 0 ``` diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx index 401a0f986..03d213258 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx @@ -115,7 +115,7 @@ To ingest the test data into ENSRainbow: If you don't have a pre-converted `ens-test-env-0.ensrainbow` file: ```bash # Navigate to apps/ensrainbow or adjust paths accordingly - pnpm run convert --input-file test/fixtures/ens_test_env_names.sql.gz --output-file ens-test-env-0.ensrainbow + pnpm run convert-sql --input-file test/fixtures/ens_test_env_names.sql.gz --output-file ens-test-env-0.ensrainbow --label-set-id ens-test-env --label-set-version 0 ``` This creates `ens-test-env-0.ensrainbow`. @@ -274,21 +274,21 @@ This section covers the conversion of source data (like SQL dumps or empty files This command converts a SQL dump file (`ens_names.sql.gz`) into an `.ensrainbow` file for version 0 of the `subgraph` Label Set. ```bash # Assuming ens_names.sql.gz contains the primary dataset -time pnpm run convert --input-file ens_names.sql.gz --output-file subgraph_0.ensrainbow --label-set-id subgraph --label-set-version 0 +time pnpm run convert-sql --input-file ens_names.sql.gz --output-file subgraph_0.ensrainbow --label-set-id subgraph --label-set-version 0 ``` **For the `discovery-a` Label Set (initially empty for discovered labels):** This creates an empty `.ensrainbow` file for version 0 of the `discovery-a` Label Set, which is used for labels discovered dynamically. ```bash -touch empty.sql -gzip empty.sql -time pnpm run convert --input-file empty.sql.gz --output-file discovery-a_0.ensrainbow --label-set-id discovery-a --label-set-version 0 +# Create empty CSV file for discovery dataset +echo "" > empty.csv +time pnpm run convert --input-file empty.csv --output-file discovery-a_0.ensrainbow --label-set-id discovery-a --label-set-version 0 ``` **For the `ens-test-env` Label Set (for testing):** This converts a test dataset SQL dump into an `.ensrainbow` file for version 0 of the `ens-test-env` Label Set. ```bash -time pnpm run convert --input-file test/fixtures/ens_test_env_names.sql.gz --output-file ens-test-env_0.ensrainbow --label-set-id ens-test-env --label-set-version 0 +time pnpm run convert-sql --input-file test/fixtures/ens_test_env_names.sql.gz --output-file ens-test-env_0.ensrainbow --label-set-id ens-test-env --label-set-version 0 ``` ### 2. 
Upload `.ensrainbow` Files to R2 Storage diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx index fa0d5704b..5262fd4c1 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx @@ -22,18 +22,27 @@ To stay informed about new versions, monitor the [Available Label Sets](/ensrain ## How can I create my own .ensrainbow file? -Currently, the `convert` command is the **only way** to create new `.ensrainbow` files from scratch. This command converts PostgreSQL rainbow table dumps (`.sql.gz` format) into the binary protobuf format that ENSRainbow uses. +ENSRainbow provides two methods for creating `.ensrainbow` files: -**To create a custom .ensrainbow file:** +**Method 1: CSV Conversion (Recommended)** -1. **Prepare your data** as a PostgreSQL dump file (`.sql.gz`) with ENS labels and labelhashes +The `convert` command is the **primary method** for creating new `.ensrainbow` files from CSV data: + +1. **Prepare your data** as a CSV file with labels (1 column) or labels and labelhashes (2 columns) 2. **Run the convert command:** ```bash - pnpm run convert --input-file your_data.sql.gz --output-file custom.ensrainbow + pnpm run convert --input-file your_labels.csv --output-file custom.ensrainbow --label-set-id my-dataset --label-set-version 0 ``` -3. **Specify the label set details** using `--label-set-id` and `--label-set-version` flags -**Note:** You can download existing `.ensrainbow` files using the download scripts, but for creating entirely new files with your own data, the `convert` command is currently the only option available. +**Method 2: SQL Conversion (Legacy)** + +For converting legacy ENS Subgraph data from PostgreSQL dumps: + +```bash +pnpm run convert-sql --input-file ens_names.sql.gz --output-file custom.ensrainbow --label-set-id subgraph --label-set-version 0 +``` + +**Note:** You can also download existing `.ensrainbow` files using the download scripts. See the [CLI Reference](/ensrainbow/contributing/cli-reference/) for detailed command usage. 
From bbc2786e75bd462f51899cc38a5c08dc405fefe0 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 5 Jan 2026 15:39:30 +0100 Subject: [PATCH 17/30] refactor: update CLI documentation for output file and label set descriptions --- apps/ensrainbow/src/cli.ts | 6 +++--- .../src/content/docs/ensrainbow/concepts/creating-files.mdx | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 75dc53587..07e4a46dd 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -260,18 +260,18 @@ export function createCLI(options: CLIOptions = {}) { }) .option("output-file", { type: "string", - description: "Path to the output ensrainbow file", + description: "Path to where the resulting ensrainbow file will be output", default: join(process.cwd(), "rainbow-records.ensrainbow"), }) .option("label-set-id", { type: "string", - description: "Label set id for the rainbow record collection", + description: "Label set id for the generated ensrainbow file", demandOption: true, }) .coerce("label-set-id", buildLabelSetId) .option("label-set-version", { type: "number", - description: "Label set version for the rainbow record collection", + description: "Label set version for the generated ensrainbow file", demandOption: true, }) .coerce("label-set-version", buildLabelSetVersion); diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index cb1ac514e..9d3655397 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -47,8 +47,6 @@ For detailed information about the file format structure, see the [Data Model](/ ### When to Use CSV Conversion - Creating new rainbow tables for ENSRainbow -- Building custom label sets -- Standard data ingestion workflow ### When to Use SQL Conversion @@ -83,7 +81,7 @@ pnpm run convert \ - `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`) - `--progress-interval`: Progress logging frequency (default: 50000 records) -- `--existing-db-path`: Path to existing ENSRainbow database to filter out existing labels +- `--existing-db-path`: Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file - `--silent`: Disable progress bar (useful for scripts and automated workflows) ### CSV Format Support From af4b04175fec4d022fb0f9b693f2b52b18b36ee2 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 5 Jan 2026 15:54:36 +0100 Subject: [PATCH 18/30] docs: enhance SQL conversion section with repository link for legacy data files --- .../src/content/docs/ensrainbow/concepts/creating-files.mdx | 4 +--- .../src/content/docs/ensrainbow/concepts/data-model.mdx | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 9d3655397..eedb4487c 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -50,9 +50,7 @@ For detailed information about the file format structure, see the [Data Model](/ ### When to Use SQL Conversion -- Converting existing ENS Subgraph rainbow tables -- Working with the legacy `ens_names.sql.gz` file -- Migrating from previous ENS data formats +- Working with 
the legacy `ens_names.sql.gz` file. These legacy data files can be obtained from [The Graph's ENS Rainbow repository](https://github.com/graphprotocol/ens-rainbow). ## Method 1: Converting from CSV Files diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx index 64189a07c..dc5d4beaf 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/data-model.mdx @@ -109,7 +109,7 @@ subgraph_1.ensrainbow # next version with incremental labelhash-to-label map ENSRainbow provides two methods for creating `.ensrainbow` files from different data sources: - **CSV Conversion**: Convert custom datasets from CSV files using `pnpm run convert` -- **SQL Conversion**: Convert legacy ENS Subgraph data (`ens_names.sql.gz`) using `pnpm run convert-sql` +- **SQL Conversion**: Convert legacy ENS Subgraph data (`ens_names.sql.gz`) using `pnpm run convert-sql`. These legacy data files can be obtained from [The Graph's ENS Rainbow repository](https://github.com/graphprotocol/ens-rainbow). For complete instructions, examples, and workflow guidance, see the [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) guide. From 0ebee692bb432cfecdff643d8601a9bddde2b31e Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 5 Jan 2026 17:38:40 +0100 Subject: [PATCH 19/30] refactor: update CLI to make output-file optional and enhance documentation for file naming conventions --- apps/ensrainbow/src/cli.ts | 34 +++++++------- .../ensrainbow/concepts/creating-files.mdx | 45 ++++++++++--------- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 07e4a46dd..f6368b78d 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -57,14 +57,14 @@ interface PurgeArgs { interface ConvertArgs { "input-file": string; - "output-file": string; + "output-file"?: string; "label-set-id": LabelSetId; "label-set-version": LabelSetVersion; } interface ConvertCsvArgs { "input-file": string; - "output-file": string; + "output-file"?: string; "label-set-id": LabelSetId; "label-set-version": LabelSetVersion; "progress-interval"?: number; @@ -203,11 +203,6 @@ export function createCLI(options: CLIOptions = {}) { description: "Path to the CSV input file", demandOption: true, }) - .option("output-file", { - type: "string", - description: "Path to where the resulting ensrainbow file will be output", - default: join(process.cwd(), "rainbow-records.ensrainbow"), - }) .option("label-set-id", { type: "string", description: "Label set id for the generated ensrainbow file", @@ -220,6 +215,10 @@ export function createCLI(options: CLIOptions = {}) { demandOption: true, }) .coerce("label-set-version", buildLabelSetVersion) + .option("output-file", { + type: "string", + description: "Path to where the resulting ensrainbow file will be output", + }) .option("progress-interval", { type: "number", description: "Number of records to process before logging progress", @@ -237,9 +236,12 @@ export function createCLI(options: CLIOptions = {}) { }); }, async (argv: ArgumentsCamelCase) => { + const outputFile = + argv["output-file"] ?? 
+ join(process.cwd(), `${argv["label-set-id"]}_${argv["label-set-version"]}.ensrainbow`); await convertCsvCommand({ inputFile: argv["input-file"], - outputFile: argv["output-file"], + outputFile, labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], progressInterval: argv["progress-interval"], @@ -258,11 +260,6 @@ export function createCLI(options: CLIOptions = {}) { description: "Path to the gzipped SQL dump file", default: join(process.cwd(), "ens_names.sql.gz"), }) - .option("output-file", { - type: "string", - description: "Path to where the resulting ensrainbow file will be output", - default: join(process.cwd(), "rainbow-records.ensrainbow"), - }) .option("label-set-id", { type: "string", description: "Label set id for the generated ensrainbow file", @@ -274,12 +271,19 @@ export function createCLI(options: CLIOptions = {}) { description: "Label set version for the generated ensrainbow file", demandOption: true, }) - .coerce("label-set-version", buildLabelSetVersion); + .coerce("label-set-version", buildLabelSetVersion) + .option("output-file", { + type: "string", + description: "Path to where the resulting ensrainbow file will be output", + }); }, async (argv: ArgumentsCamelCase) => { + const outputFile = + argv["output-file"] ?? + join(process.cwd(), `${argv["label-set-id"]}_${argv["label-set-version"]}.ensrainbow`); await convertCommand({ inputFile: argv["input-file"], - outputFile: argv["output-file"], + outputFile, labelSetId: argv["label-set-id"], labelSetVersion: argv["label-set-version"], }); diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index eedb4487c..1d8d61621 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -50,7 +50,8 @@ For detailed information about the file format structure, see the [Data Model](/ ### When to Use SQL Conversion -- Working with the legacy `ens_names.sql.gz` file. These legacy data files can be obtained from [The Graph's ENS Rainbow repository](https://github.com/graphprotocol/ens-rainbow). +- **Legacy migration only**: Converting existing `ens_names.sql.gz` file from the legacy ENS Subgraph. This file can be obtained from [The Graph's ENS Rainbow repository](https://github.com/graphprotocol/ens-rainbow). +- **Note**: We recommend using CSV conversion for all new label sets. The SQL conversion method exists primarily for migrating away from legacy subgraph data, not for creating new subgraph-based label sets. 
 ## Method 1: Converting from CSV Files
 
@@ -72,12 +73,12 @@ pnpm run convert \
 ### Required Parameters
 
 - `--input-file`: Path to the CSV file
-- `--label-set-id`: Identifier for the label set
-- `--label-set-version`: Version number for the label set
+- `--label-set-id`: Identifier for the output `.ensrainbow` file that will be created (used in file naming and metadata)
+- `--label-set-version`: Version number for the output `.ensrainbow` file that will be created (used in file naming and metadata)
 
 ### Optional Parameters
 
-- `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`)
+- `--output-file`: Output file path (defaults to `{label-set-id}_{label-set-version}.ensrainbow` in the current working directory)
 - `--progress-interval`: Progress logging frequency (default: 50000 records)
 - `--existing-db-path`: Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file
 - `--silent`: Disable progress bar (useful for scripts and automated workflows)
@@ -175,7 +176,9 @@ pnpm run convert \
 
 ## Method 2: Converting from SQL Dumps
 
-The `convert-sql` command processes gzipped SQL dump files from the ENS Subgraph.
+:::warning[Legacy Method]
+The `convert-sql` command processes the gzipped SQL dump file from the legacy ENS Subgraph. This method exists for migrating away from legacy subgraph data. **For all new label sets, we strongly recommend using CSV conversion (Method 1) instead.**
+:::
 
 ### Command Syntax
 
@@ -190,17 +193,21 @@ pnpm run convert-sql \
 ### Required Parameters
 
 - `--input-file`: Path to the gzipped SQL dump file
-- `--label-set-id`: Identifier for the label set (e.g., `subgraph`, `discovery-a`)
-- `--label-set-version`: Version number for the label set (non-negative integer)
+- `--label-set-id`: Identifier for the output `.ensrainbow` file that will be created (used in file naming and metadata, e.g., `subgraph`)
+- `--label-set-version`: Version number for the output `.ensrainbow` file that will be created (used in file naming and metadata, non-negative integer)
 
 ### Optional Parameters
 
-- `--output-file`: Output file path (defaults to `rainbow-records.ensrainbow`)
+- `--output-file`: Output file path (defaults to `{label-set-id}_{label-set-version}.ensrainbow` in the current working directory)
+
+### Example: Converting Legacy ENS Subgraph Data
 
-### Example: Converting ENS Subgraph Data
+:::note[Legacy Migration Only]
+This example shows how to convert existing legacy subgraph data. For new label sets, use CSV conversion instead.
+:::
 
 ```bash
-# Convert main ENS Subgraph data
+# Convert legacy ENS Subgraph data (migration use case only)
 pnpm run convert-sql \
   --input-file ens_names.sql.gz \
   --output-file subgraph_0.ensrainbow \
   --label-set-id subgraph \
   --label-set-version 0
 ```
@@ -229,10 +236,14 @@ pnpm run convert-sql \
 
 ## Common Workflows
 
-### Workflow 1: Migrating from ENS Subgraph
+### Workflow 1: Migrating from Legacy ENS Subgraph
+
+:::warning[Legacy Migration Only]
+This workflow is for migrating away from legacy ENS Subgraph data. For creating new label sets, use CSV conversion (see Workflow 3) instead.
+:::
 
 ```bash
-# 1. 
Convert legacy SQL dump to .ensrainbow pnpm run convert-sql \ --input-file ens_names.sql.gz \ --output-file subgraph_0.ensrainbow \ @@ -356,8 +367,7 @@ ENSRainbow download scripts save files to specific subdirectories: Follow the naming convention: `{label-set-id}_{label-set-version}.ensrainbow` **Examples:** -- `subgraph_0.ensrainbow` - Main ENS data, version 0 -- `subgraph_1.ensrainbow` - Main ENS data, version 1 (incremental update) +- `subgraph_0.ensrainbow` - Legacy ENS data, version 0 - `discovery-a_0.ensrainbow` - Discovery dataset, version 0 - `ens-test-env_0.ensrainbow` - Test environment data, version 0 @@ -395,13 +405,6 @@ pnpm run convert \ --label-set-version 1 \ --existing-db-path data-my-dataset -# For legacy SQL data -pnpm run convert-sql \ - --input-file my-data.sql.gz \ - --output-file my-dataset_0.ensrainbow \ - --label-set-id my-dataset \ - --label-set-version 0 -``` ### 2. Validate Your File From 9cdbb39a1184fe7e7489c8d96ead6f8fce12d740 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 5 Jan 2026 18:15:06 +0100 Subject: [PATCH 20/30] fix: enforce existing database path requirement in CLI and improve error handling for database access --- apps/ensrainbow/src/cli.ts | 15 ++++++++++++++- .../src/commands/convert-csv-command.test.ts | 11 +++-------- .../src/commands/convert-csv-command.ts | 9 +++++++-- .../docs/ensrainbow/concepts/creating-files.mdx | 5 ++--- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index f6368b78d..849057647 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -227,12 +227,25 @@ export function createCLI(options: CLIOptions = {}) { .option("existing-db-path", { type: "string", description: - "Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file", + "Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file (required when --label-set-version > 0)", }) .option("silent", { type: "boolean", description: "Disable progress bar (useful for scripts)", default: false, + }) + .check((argv) => { + const labelSetVersion = argv["label-set-version"]; + if ( + labelSetVersion !== undefined && + labelSetVersion > 0 && + !argv["existing-db-path"] + ) { + throw new Error( + "--existing-db-path is required when --label-set-version is greater than 0", + ); + } + return true; }); }, async (argv: ArgumentsCamelCase) => { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 685ff6da7..39b3cafbc 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -341,12 +341,12 @@ describe("convert-csv-command", () => { await db.close(); }); - it("should handle non-existent database path gracefully", async () => { + it("should throw error when existing database path cannot be opened", async () => { const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); const outputFile = join(tempDir, "output_no_db.ensrainbow"); const nonExistentDbPath = join(tempDir, "non-existent-db"); - // Should not throw error even with non-existent database path + // Should throw error when database path is provided but cannot be opened await expect( convertCsvCommand({ inputFile, @@ -355,12 +355,7 @@ describe("convert-csv-command", () => { labelSetVersion: 0 as LabelSetVersion, existingDbPath: nonExistentDbPath, }), - ).resolves.not.toThrow(); - - // 
Verify the output file was still created - const stats = await stat(outputFile); - expect(stats.isFile()).toBe(true); - expect(stats.size).toBeGreaterThan(0); + ).rejects.toThrow("Cannot proceed without existing database"); }); it("should work through CLI with existing database path", async () => { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 87995971e..dc9020e44 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -242,8 +242,13 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { existingDb = await ENSRainbowDB.open(options.existingDbPath); logger.info("Successfully opened existing database for label filtering"); } catch (error) { - logger.warn(`Failed to open existing database at ${options.existingDbPath}: ${error}`); - logger.warn("Proceeding without filtering existing labels"); + const errorMessage = error instanceof Error ? error.message : String(error); + logger.error( + `Failed to open existing database at ${options.existingDbPath}: ${errorMessage}`, + ); + throw new Error( + `Cannot proceed without existing database. Failed to open database at ${options.existingDbPath}: ${errorMessage}`, + ); } } diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 1d8d61621..7b8b6b2f3 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -172,7 +172,7 @@ pnpm run convert \ 4. **Computes** or validates labelhashes as needed 5. **Filters** existing labels if `--existing-db-path` is provided 6. **Filters** duplicate labels within the same CSV file -7. **Writes** protobuf messages with length-delimited encoding +7. **Writes** .ensrainbow file as output ## Method 2: Converting from SQL Dumps @@ -231,8 +231,7 @@ pnpm run convert-sql \ 1. **Streams** the gzipped SQL file to avoid memory issues 2. **Parses** SQL COPY statements to extract label/labelhash pairs 3. **Validates** each record and skips invalid entries -4. **Writes** protobuf messages with length-delimited encoding -5. **Creates** a header message followed by individual record messages +4. 
**Writes** .ensrainbow file as output ## Common Workflows From a7fd4f3beccc3a4d176e8353bea4f029442fb554 Mon Sep 17 00:00:00 2001 From: djstrong Date: Mon, 5 Jan 2026 18:26:30 +0100 Subject: [PATCH 21/30] refactor: update createRainbowRecord function to use RainbowRecord type and improve labelhash handling --- .../src/commands/convert-csv-command.ts | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index dc9020e44..4d49403c5 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -21,6 +21,7 @@ import { CURRENT_ENSRAINBOW_FILE_FORMAT_VERSION, createRainbowProtobufRoot, } from "../utils/protobuf-schema.js"; +import type { RainbowRecord } from "../utils/rainbow-record.js"; /** * Estimate memory usage of a Map (rough approximation) @@ -270,18 +271,18 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { /** * Create rainbow record from parsed CSV row */ -function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string } { +function createRainbowRecord(row: string[]): RainbowRecord { const label = String(row[0]); if (row.length === 1) { // Single column: compute labelhash using labelhash function const labelHashBytes = labelHashToBytes(labelhash(label)); return { - labelhash: Buffer.from(labelHashBytes), + labelHash: labelHashBytes, label: label, }; } else { - // Two columns: validate and use provided hash + // Two columns: validate labelhash format and use provided hash // Trim whitespace from hash (metadata), but preserve label as-is const providedHash = String(row[1]).trim(); if (providedHash === "") { @@ -291,7 +292,7 @@ function createRainbowRecord(row: string[]): { labelhash: Buffer; label: string try { const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); return { - labelhash: Buffer.from(labelHash), + labelHash: labelHash, label: label, }; } catch (error) { @@ -323,7 +324,7 @@ async function processRecord( const rainbowRecord = createRainbowRecord(row); const label = rainbowRecord.label; - const labelHashBytes = rainbowRecord.labelhash; + const labelHashBytes = Buffer.from(rainbowRecord.labelHash); // Check if labelhash already exists in the existing database if (existingDb) { @@ -345,7 +346,11 @@ async function processRecord( await dedupDb.add(label, ""); // Create protobuf message and write with backpressure handling - const recordMessage = RainbowRecordType.fromObject(rainbowRecord); + // Map RainbowRecord (labelHash) to protobuf format (labelhash) + const recordMessage = RainbowRecordType.fromObject({ + labelhash: Buffer.from(rainbowRecord.labelHash), + label: rainbowRecord.label, + }); const buffer = Buffer.from(RainbowRecordType.encodeDelimited(recordMessage).finish()); // Check if write returns false (buffer full) - if so, wait for drain From aac678950ab8686f6280e10bdd5b57b509e02bc8 Mon Sep 17 00:00:00 2001 From: djstrong Date: Tue, 6 Jan 2026 01:00:13 +0100 Subject: [PATCH 22/30] refactor: remove label set version requirement from CLI and enhance output file handling --- apps/ensrainbow/src/cli.ts | 31 +------ .../src/commands/convert-csv-command.test.ts | 50 ++--------- .../src/commands/convert-csv-command.ts | 90 +++++++++++++++---- 3 files changed, 83 insertions(+), 88 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 849057647..cb4c18b23 100644 --- a/apps/ensrainbow/src/cli.ts +++ 
b/apps/ensrainbow/src/cli.ts @@ -66,7 +66,6 @@ interface ConvertCsvArgs { "input-file": string; "output-file"?: string; "label-set-id": LabelSetId; - "label-set-version": LabelSetVersion; "progress-interval"?: number; "existing-db-path"?: string; silent?: boolean; @@ -209,15 +208,10 @@ export function createCLI(options: CLIOptions = {}) { demandOption: true, }) .coerce("label-set-id", buildLabelSetId) - .option("label-set-version", { - type: "number", - description: "Label set version for the generated ensrainbow file", - demandOption: true, - }) - .coerce("label-set-version", buildLabelSetVersion) .option("output-file", { type: "string", - description: "Path to where the resulting ensrainbow file will be output", + description: + "Path to where the resulting ensrainbow file will be output (if not provided, will be generated automatically)", }) .option("progress-interval", { type: "number", @@ -227,36 +221,19 @@ export function createCLI(options: CLIOptions = {}) { .option("existing-db-path", { type: "string", description: - "Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file (required when --label-set-version > 0)", + "Path to existing ENSRainbow database to filter out existing labels and determine the next label set version (if not provided, version will be 0)", }) .option("silent", { type: "boolean", description: "Disable progress bar (useful for scripts)", default: false, - }) - .check((argv) => { - const labelSetVersion = argv["label-set-version"]; - if ( - labelSetVersion !== undefined && - labelSetVersion > 0 && - !argv["existing-db-path"] - ) { - throw new Error( - "--existing-db-path is required when --label-set-version is greater than 0", - ); - } - return true; }); }, async (argv: ArgumentsCamelCase) => { - const outputFile = - argv["output-file"] ?? 
- join(process.cwd(), `${argv["label-set-id"]}_${argv["label-set-version"]}.ensrainbow`); await convertCsvCommand({ inputFile: argv["input-file"], - outputFile, + outputFile: argv["output-file"], labelSetId: argv["label-set-id"], - labelSetVersion: argv["label-set-version"], progressInterval: argv["progress-interval"], existingDbPath: argv["existing-db-path"], silent: argv["silent"], diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 39b3cafbc..7646f5a77 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -5,7 +5,7 @@ import { join } from "path"; import { labelhash } from "viem"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -import { type LabelSetId, type LabelSetVersion, labelHashToBytes } from "@ensnode/ensnode-sdk"; +import { type LabelSetId, labelHashToBytes } from "@ensnode/ensnode-sdk"; import { createCLI } from "@/cli"; import { ENSRainbowDB } from "@/lib/database"; @@ -40,7 +40,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-csv-one-col" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }); @@ -74,7 +73,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-csv-two-col" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }); @@ -108,7 +106,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-csv-invalid" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, }), ).rejects.toThrow(/Failed on line 1: Invalid labelHash/); }); @@ -123,7 +120,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-csv-special" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }); @@ -165,7 +161,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-csv-invalid-hash" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, }), ).rejects.toThrow(/Failed on line 2: Invalid labelHash/); }); @@ -181,7 +176,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-missing" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, }), ).rejects.toThrow(); }); @@ -204,8 +198,6 @@ describe("convert-csv-command", () => { outputFile, "--label-set-id", "test-cli-csv", - "--label-set-version", - "0", ]); // Verify file was created @@ -234,7 +226,6 @@ describe("convert-csv-command", () => { inputFile, outputFile: initialOutputFile, labelSetId: "test-filtering" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }); @@ -256,11 +247,11 @@ describe("convert-csv-command", () => { await db.close(); // Now convert the same CSV file again, but with filtering enabled + // This should automatically determine version 1 from the existing database await convertCsvCommand({ inputFile, outputFile, labelSetId: "test-filtering" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, // Use same version as initial existingDbPath: dataDir, silent: true, }); @@ -273,21 +264,12 @@ describe("convert-csv-command", () => { const initialStats = await stat(initialOutputFile); expect(outputStats.size).toBeLessThan(initialStats.size); - // Verify that the filtered file contains fewer records + // Verify that ingesting the filtered file (version 1) into a new database fails + // because new databases require version 0 for initial ingestion const filteredDataDir = join(tempDir, 
"db_filtered_result"); - await cli.parse([ - "ingest-ensrainbow", - "--input-file", - outputFile, - "--data-dir", - filteredDataDir, - ]); - - const filteredDb = await ENSRainbowDB.open(filteredDataDir); - expect(await filteredDb.validate()).toBe(true); - const filteredCount = await filteredDb.getPrecalculatedRainbowRecordCount(); - expect(filteredCount).toBe(0); // All labels should be filtered out since they already exist - await filteredDb.close(); + await expect( + cli.parse(["ingest-ensrainbow", "--input-file", outputFile, "--data-dir", filteredDataDir]), + ).rejects.toThrow(/Initial ingestion must use a file with label set version 0/); }); it("should filter out duplicate labels within the same conversion", async () => { @@ -303,7 +285,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-duplicates" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }); @@ -352,10 +333,9 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-no-db" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, existingDbPath: nonExistentDbPath, }), - ).rejects.toThrow("Cannot proceed without existing database"); + ).rejects.toThrow(/Database is not open/); }); it("should work through CLI with existing database path", async () => { @@ -375,8 +355,6 @@ describe("convert-csv-command", () => { initialOutputFile, "--label-set-id", "test-cli-filtering", - "--label-set-version", - "0", ]); await cli.parse([ @@ -396,8 +374,6 @@ describe("convert-csv-command", () => { outputFile, "--label-set-id", "test-cli-filtering", - "--label-set-version", - "1", "--existing-db-path", dataDir, ]); @@ -429,7 +405,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-small" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }); @@ -475,7 +450,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-many-labels" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }); @@ -498,7 +472,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-empty" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }), ).resolves.not.toThrow(); @@ -531,7 +504,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-whitespace" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }), ).resolves.not.toThrow(); @@ -555,7 +527,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-header" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }), ).rejects.toThrow(/Invalid labelHash/); @@ -569,7 +540,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-header-valid" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }), ).resolves.not.toThrow(); @@ -600,7 +570,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-malformed" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }), ).rejects.toThrow(/Expected \d+ columns/); @@ -619,7 +588,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-malformed2" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }), ).rejects.toThrow(/Expected \d+ columns/); @@ -638,7 +606,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-quoted" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }), 
).resolves.not.toThrow(); @@ -683,7 +650,6 @@ describe("convert-csv-command", () => { inputFile, outputFile, labelSetId: "test-empty-hash" as LabelSetId, - labelSetVersion: 0 as LabelSetVersion, silent: true, }), ).rejects.toThrow(/LabelHash cannot be empty/); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 4d49403c5..e44eb2f9a 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -122,11 +122,10 @@ function setupProgressBar(): ProgressBar { */ export interface ConvertCsvCommandOptions { inputFile: string; - outputFile: string; + outputFile?: string; // Optional - will be generated if not provided labelSetId: string; - labelSetVersion: number; progressInterval?: number; - existingDbPath?: string; // Path to existing ENSRainbow database to check for existing labels + existingDbPath?: string; // Path to existing ENSRainbow database to check for existing labels and determine next version silent?: boolean; // Disable progress bar for tests } @@ -212,15 +211,64 @@ async function checkLabelHashExists(db: ENSRainbowDB, labelHashBytes: Buffer): P } } +/** + * Determine the label set version based on existing database or default to 0 + */ +async function determineLabelSetVersion( + existingDbPath: string | undefined, + labelSetId: string, +): Promise { + if (!existingDbPath) { + return 0; + } + + try { + logger.info(`Opening existing database to determine next label set version: ${existingDbPath}`); + const existingDb = await ENSRainbowDB.open(existingDbPath); + const labelSet = await existingDb.getLabelSet(); + + // Validate that the label set ID matches + if (labelSet.labelSetId !== labelSetId) { + await existingDb.close(); + throw new Error( + `Label set ID mismatch! Database label set id: ${labelSet.labelSetId}, provided label set id: ${labelSetId}`, + ); + } + + const nextVersion = labelSet.highestLabelSetVersion + 1; + await existingDb.close(); + logger.info( + `Determined next label set version: ${nextVersion} (current highest: ${labelSet.highestLabelSetVersion})`, + ); + return nextVersion; + } catch (error) { + const errorMessage = error instanceof Error ? 
error.message : String(error); + throw new Error( + `Failed to determine label set version from existing database at ${existingDbPath}: ${errorMessage}`, + ); + } +} + +/** + * Generate output file name from label set ID and version + */ +function generateOutputFileName(labelSetId: string, labelSetVersion: number): string { + return `${labelSetId}_${labelSetVersion}.ensrainbow`; +} + /** * Initialize conversion setup and logging */ -async function initializeConversion(options: ConvertCsvCommandOptions) { +async function initializeConversion( + options: ConvertCsvCommandOptions, + labelSetVersion: number, + outputFile: string, +) { logger.info("Starting conversion from CSV to protobuf format..."); logger.info(`Input file: ${options.inputFile}`); - logger.info(`Output file: ${options.outputFile}`); + logger.info(`Output file: ${outputFile}`); logger.info(`Label set id: ${options.labelSetId}`); - logger.info(`Label set version: ${options.labelSetVersion}`); + logger.info(`Label set version: ${labelSetVersion}`); // Check file size and warn for very large files try { @@ -235,7 +283,7 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { logger.warn(`Could not determine file size: ${error}`); } - // Open existing database if path is provided + // Open existing database if path is provided (for filtering existing labels) let existingDb: ENSRainbowDB | null = null; if (options.existingDbPath) { try { @@ -254,14 +302,9 @@ async function initializeConversion(options: ConvertCsvCommandOptions) { } const { RainbowRecordType, RainbowRecordCollectionType } = createRainbowProtobufRoot(); - const outputStream = setupWriteStream(options.outputFile); + const outputStream = setupWriteStream(outputFile); - writeHeader( - outputStream, - RainbowRecordCollectionType, - options.labelSetId, - options.labelSetVersion, - ); + writeHeader(outputStream, RainbowRecordCollectionType, options.labelSetId, labelSetVersion); logger.info("Reading and processing CSV file line by line with streaming..."); @@ -501,10 +544,15 @@ async function processCSVFile( * Main CSV conversion command with true streaming using fast-csv */ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Promise { - // Validate that existingDbPath is provided when labelSetVersion > 0 - if (options.labelSetVersion > 0 && !options.existingDbPath) { - throw new Error("existingDbPath must be specified if label set version is higher than 0"); - } + // Determine label set version from existing database or default to 0 + const labelSetVersion = await determineLabelSetVersion( + options.existingDbPath, + options.labelSetId, + ); + + // Generate output file name if not provided + const outputFile = + options.outputFile ?? 
generateOutputFileName(options.labelSetId, labelSetVersion); const stats: ConversionStats = { totalLines: 0, @@ -520,7 +568,11 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom let tempDedupDir: string | null = null; try { - const { RainbowRecordType, outputStream, existingDb: db } = await initializeConversion(options); + const { + RainbowRecordType, + outputStream, + existingDb: db, + } = await initializeConversion(options, labelSetVersion, outputFile); existingDb = db; // Create temporary deduplication database From 35cf39bd7d9fe5b1690a87c9c23cfef66d665ff4 Mon Sep 17 00:00:00 2001 From: djstrong Date: Wed, 7 Jan 2026 13:19:51 +0100 Subject: [PATCH 23/30] docs: update documentation to reflect removal of label set version requirement and clarify CSV conversion process --- .../src/commands/convert-csv-command.ts | 2 +- .../docs/ensrainbow/concepts/architecture.mdx | 11 ++-- .../ensrainbow/concepts/creating-files.mdx | 57 +++++++------------ .../ensrainbow/contributing/cli-reference.mdx | 3 +- .../docs/ensrainbow/contributing/index.mdx | 20 +++---- .../src/content/docs/ensrainbow/faq.mdx | 16 +----- 6 files changed, 43 insertions(+), 66 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index e44eb2f9a..9d1f75497 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -333,7 +333,7 @@ function createRainbowRecord(row: string[]): RainbowRecord { } const maybeLabelHash = providedHash.startsWith("0x") ? providedHash : `0x${providedHash}`; try { - const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); + const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); // performs labelhash format validation return { labelHash: labelHash, label: label, diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/architecture.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/architecture.mdx index b578aca07..8fb49b75a 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/architecture.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/architecture.mdx @@ -10,7 +10,7 @@ import { LinkCard } from '@astrojs/starlight/components'; ENSRainbow consists of four primary layers working together to "heal" unknown labels: -1. **Data Generation & Conversion** – legacy `.sql.gz` rainbow tables are converted to the modern `.ensrainbow` format. +1. **Data Generation & Conversion** – CSV files are converted to the modern `.ensrainbow` format (SQL conversion is available only for migrating legacy ENS Subgraph data). 2. **Data Ingestion** – the `.ensrainbow` files are ingested into a LevelDB database using the `ingest-ensrainbow` CLI. 3. **HTTP API Service** – state in the database is exposed through a lightweight HTTP API. 4. **Client Integration** – applications call the API directly or via the TypeScript SDK. @@ -18,10 +18,13 @@ ENSRainbow consists of four primary layers working together to "heal" unknown la ```mermaid flowchart TD subgraph Data_Generation - SQL[".sql.gz files"] + CSV["CSV files"] + SQL[".sql.gz files
(legacy only)"] ENSRB[".ensrainbow files"] - SQL --> Convert["convert" command] - Convert --> ENSRB + CSV --> ConvertCSV["convert command"] + SQL --> ConvertSQL["convert-sql command
(legacy migration)"] + ConvertCSV --> ENSRB + ConvertSQL --> ENSRB end ENSRB --> Ingest["ingest-ensrainbow"] diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 7b8b6b2f3..4af35a0ee 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -64,7 +64,6 @@ pnpm run convert \ --input-file \ --output-file \ --label-set-id \ - --label-set-version \ [--progress-interval ] \ [--existing-db-path ] \ [--silent] @@ -74,7 +73,6 @@ pnpm run convert \ - `--input-file`: Path to the CSV file - `--label-set-id`: Identifier for the output `.ensrainbow` file that will be created (used in file naming and metadata) -- `--label-set-version`: Version number for the output `.ensrainbow` file that will be created (used in file naming and metadata) ### Optional Parameters @@ -103,7 +101,7 @@ vitalik,0xaf2caa1c2ca1d027f1ac823b529d0a67cd144264b2789fa2ea4d63a67c7103cc ens,0x5cee339e13375638553bdf5a6e36ba80fb9f6a4f0783680884d92b558aa471da ``` -The converter validates that provided labelhashes match the computed hash for each label. +The converter validates the format of provided labelhashes (does not verify they match the label). ### Label Filtering @@ -117,7 +115,6 @@ pnpm run convert \ --input-file new-labels.csv \ --output-file incremental_1.ensrainbow \ --label-set-id my-dataset \ - --label-set-version 1 \ --existing-db-path data-my-dataset ``` @@ -149,7 +146,6 @@ pnpm run convert \ --input-file test-labels.csv \ --output-file test-dataset_0.ensrainbow \ --label-set-id test-dataset \ - --label-set-version 0 ``` ### Example: Creating Discovery Dataset @@ -161,7 +157,6 @@ pnpm run convert \ --input-file empty.csv \ --output-file discovery-a_0.ensrainbow \ --label-set-id discovery-a \ - --label-set-version 0 ``` ### How It Works @@ -170,6 +165,9 @@ pnpm run convert \ 2. **Streams** CSV parsing using fast-csv for memory efficiency 3. **Validates** column count and data format 4. **Computes** or validates labelhashes as needed + - For single-column format: Computes labelhash using the `labelhash()` function + - For two-column format: Validates the format of the provided labelhash (does not verify it matches the label) + - Invalid labelhashes are rejected if they don't meet format requirements (66 characters including "0x" prefix, lowercase hex, valid hex format) 5. **Filters** existing labels if `--existing-db-path` is provided 6. **Filters** duplicate labels within the same CSV file 7. **Writes** .ensrainbow file as output @@ -215,22 +213,19 @@ pnpm run convert-sql \ --label-set-version 0 ``` -### Example: Converting Test Data - -```bash -# Convert ens-test-env data -pnpm run convert-sql \ - --input-file test/fixtures/ens_test_env_names.sql.gz \ - --output-file ens-test-env_0.ensrainbow \ - --label-set-id ens-test-env \ - --label-set-version 0 -``` ### How It Works 1. **Streams** the gzipped SQL file to avoid memory issues 2. **Parses** SQL COPY statements to extract label/labelhash pairs 3. 
**Validates** each record and skips invalid entries + - **Invalid line format**: Lines that don't contain exactly 2 tab-separated columns (labelHash and label) + - **Invalid labelHash format**: LabelHash values that: + - Don't have exactly 66 characters (must be "0x" prefix + 64 hex digits) + - Are not in lowercase (must be all lowercase hexadecimal) + - Don't start with "0x" prefix + - Contain invalid hexadecimal characters + - Invalid entries are safely skipped as they would be unreachable by the ENS Subgraph 4. **Writes** .ensrainbow file as output ## Common Workflows @@ -265,11 +260,10 @@ pnpm run serve --data-dir data-subgraph --port 3223 ```bash # 1. Convert test data -pnpm run convert-sql \ - --input-file test/fixtures/ens_test_env_names.sql.gz \ +pnpm run convert \ + --input-file test/fixtures/ens_test_env_names.csv \ --output-file ens-test-env_0.ensrainbow \ - --label-set-id ens-test-env \ - --label-set-version 0 + --label-set-id ens-test-env # 2. Ingest test data pnpm run ingest-ensrainbow \ @@ -292,8 +286,7 @@ mylabel3" > custom-labels.csv pnpm run convert \ --input-file custom-labels.csv \ --output-file custom_0.ensrainbow \ - --label-set-id custom \ - --label-set-version 0 + --label-set-id custom # 3. Ingest and serve pnpm run ingest-ensrainbow \ @@ -310,8 +303,7 @@ pnpm run serve --data-dir data-custom --port 3223 pnpm run convert \ --input-file initial-labels.csv \ --output-file my-dataset_0.ensrainbow \ - --label-set-id my-dataset \ - --label-set-version 0 + --label-set-id my-dataset # 2. Ingest initial data pnpm run ingest-ensrainbow \ @@ -323,7 +315,6 @@ pnpm run convert \ --input-file new-labels.csv \ --output-file my-dataset_1.ensrainbow \ --label-set-id my-dataset \ - --label-set-version 1 \ --existing-db-path data-my-dataset # 4. Ingest incremental update @@ -386,24 +377,21 @@ If you want to create, publish, and distribute your own `.ensrainbow` files, fol ### 1. Create Your Dataset -First, prepare your data in either CSV (recommended) or SQL format, then convert it using the appropriate method: +First, prepare your data in CSV format, then convert it using the `convert` command: ```bash -# For CSV data (recommended) pnpm run convert \ --input-file my-labels.csv \ --output-file my-dataset_0.ensrainbow \ - --label-set-id my-dataset \ - --label-set-version 0 + --label-set-id my-dataset -# For CSV data with filtering (if you have an existing database) +# to create an incremental update, you can use the `--existing-db-path` flag to filter out existing labels: pnpm run convert \ - --input-file my-labels.csv \ + --input-file my-labels2.csv \ --output-file my-dataset_1.ensrainbow \ --label-set-id my-dataset \ - --label-set-version 1 \ --existing-db-path data-my-dataset - +``` ### 2. 
Validate Your File @@ -617,8 +605,7 @@ NEW_VERSION="1" pnpm run convert \ --input-file updated-labels.csv \ --output-file ${LABEL_SET_ID}_${NEW_VERSION}.ensrainbow \ - --label-set-id ${LABEL_SET_ID} \ - --label-set-version ${NEW_VERSION} + --label-set-id ${LABEL_SET_ID} # Create prebuilt database pnpm run ingest-ensrainbow \ diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/cli-reference.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/cli-reference.mdx index 31b8c686a..58023497a 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/cli-reference.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/cli-reference.mdx @@ -8,7 +8,7 @@ keywords: [ensrainbow, cli] | Command | Purpose | Most useful flags | Example | |---------|---------|-------------------|---------| -| `convert` | Convert CSV files to `.ensrainbow` format. **This is the primary method for creating new .ensrainbow files.** | `--input-file`, `--output-file`, `--label-set-id`, `--label-set-version`, `--existing-db-path`, `--silent` | `pnpm run convert --input-file labels.csv --output-file my-dataset_0.ensrainbow --label-set-id my-dataset --label-set-version 0` | +| `convert` | Convert CSV files to `.ensrainbow` format. **This is the primary method for creating new .ensrainbow files.** | `--input-file`, `--output-file`, `--label-set-id`, `--existing-db-path`, `--silent` | `pnpm run convert --input-file labels.csv --output-file my-dataset_0.ensrainbow --label-set-id my-dataset` | | `convert-sql` | Convert legacy `.sql.gz` rainbow tables (ENS Subgraph data) to `.ensrainbow` format | `--input-file`, `--output-file`, `--label-set-id`, `--label-set-version` | `pnpm run convert-sql --input-file ens_names.sql.gz --output-file subgraph_0.ensrainbow --label-set-id subgraph --label-set-version 0` | | `ingest-ensrainbow` | Stream a `.ensrainbow` file into LevelDB | `--input-file`, `--data-dir` | `pnpm run ingest-ensrainbow --input-file my-dataset_0.ensrainbow --data-dir ./data` | | `validate` | Verify DB integrity | `--data-dir`, `--lite` | `pnpm run validate --lite` | @@ -27,7 +27,6 @@ pnpm run convert \ --input-file path/to/labels.csv \ --output-file path/to/output.ensrainbow \ --label-set-id your-label-set-id \ - --label-set-version 0 \ [--existing-db-path path/to/existing/database] \ [--silent] ``` diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx index 03d213258..984b5d683 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/contributing/index.mdx @@ -105,9 +105,9 @@ Starts the API server. The process will exit with: ## Using ENSRainbow with ens-test-env -The ens-test-env project provides a test environment for ENS development. It includes a small dataset of ENS names in the `ens_test_env_names.sql.gz` file that can be used with ENSRainbow for testing purposes. +The ens-test-env project provides a test environment for ENS development. It includes a small dataset of ENS names in the `ens_test_env_names.csv` file that can be used with ENSRainbow for testing purposes. 
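The automatic versioning documented in the `convert` and `ingest-ensrainbow` entries above is easiest to see end to end. The following is a sketch, assuming hypothetical file and directory names (`labels-batch1.csv`, `labels-batch2.csv`, `data-my-dataset`) and the default output naming `{label-set-id}_{label-set-version}.ensrainbow` implemented by `generateOutputFileName` in this patch series:

```bash
# First conversion: no --existing-db-path, so the label set version is 0
# and the default output name is my-dataset_0.ensrainbow.
pnpm run convert \
  --input-file labels-batch1.csv \
  --label-set-id my-dataset

# Ingest version 0 into a fresh database directory.
pnpm run ingest-ensrainbow \
  --input-file my-dataset_0.ensrainbow \
  --data-dir data-my-dataset

# Second conversion: --existing-db-path filters labels already present in
# the database and bumps the version to highest+1, producing
# my-dataset_1.ensrainbow.
pnpm run convert \
  --input-file labels-batch2.csv \
  --label-set-id my-dataset \
  --existing-db-path data-my-dataset
```

If `--label-set-id` does not match the label set already stored in the database, the command now fails with a label set ID mismatch error rather than writing an inconsistent file.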
-### Ingesting ens_test_env_names.sql.gz +### Ingesting ens_test_env_names.csv To ingest the test data into ENSRainbow: @@ -115,7 +115,7 @@ To ingest the test data into ENSRainbow: If you don't have a pre-converted `ens-test-env-0.ensrainbow` file: ```bash # Navigate to apps/ensrainbow or adjust paths accordingly - pnpm run convert-sql --input-file test/fixtures/ens_test_env_names.sql.gz --output-file ens-test-env-0.ensrainbow --label-set-id ens-test-env --label-set-version 0 + pnpm run convert --input-file test/fixtures/ens_test_env_names.csv --output-file ens-test-env-0.ensrainbow --label-set-id ens-test-env ``` This creates `ens-test-env-0.ensrainbow`. @@ -268,12 +268,12 @@ These steps are typically performed by project maintainers for releasing officia ### 1. Prepare `.ensrainbow` Files -This section covers the conversion of source data (like SQL dumps or empty files for initial datasets) into the `.ensrainbow` format. For detailed conversion instructions and examples, see the [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) guide. +This section covers the conversion of source data into the `.ensrainbow` format. For detailed conversion instructions and examples, see the [Creating ENSRainbow Files](/ensrainbow/concepts/creating-files) guide. -**For the `subgraph` Label Set (main dataset):** -This command converts a SQL dump file (`ens_names.sql.gz`) into an `.ensrainbow` file for version 0 of the `subgraph` Label Set. +**For the `subgraph` Label Set (legacy migration only):** +This command converts a SQL dump file (`ens_names.sql.gz`) from the legacy ENS Subgraph into an `.ensrainbow` file for version 0 of the `subgraph` Label Set. **Note:** SQL conversion is only for migrating legacy ENS Subgraph data. For all new label sets, use CSV conversion instead. ```bash -# Assuming ens_names.sql.gz contains the primary dataset +# Assuming ens_names.sql.gz contains the dataset time pnpm run convert-sql --input-file ens_names.sql.gz --output-file subgraph_0.ensrainbow --label-set-id subgraph --label-set-version 0 ``` @@ -282,13 +282,13 @@ This creates an empty `.ensrainbow` file for version 0 of the `discovery-a` Labe ```bash # Create empty CSV file for discovery dataset echo "" > empty.csv -time pnpm run convert --input-file empty.csv --output-file discovery-a_0.ensrainbow --label-set-id discovery-a --label-set-version 0 +time pnpm run convert --input-file empty.csv --output-file discovery-a_0.ensrainbow --label-set-id discovery-a ``` **For the `ens-test-env` Label Set (for testing):** -This converts a test dataset SQL dump into an `.ensrainbow` file for version 0 of the `ens-test-env` Label Set. +This converts a test dataset CSV file into an `.ensrainbow` file for version 0 of the `ens-test-env` Label Set. ```bash -time pnpm run convert-sql --input-file test/fixtures/ens_test_env_names.sql.gz --output-file ens-test-env_0.ensrainbow --label-set-id ens-test-env --label-set-version 0 +time pnpm run convert --input-file test/fixtures/ens_test_env_names.csv --output-file ens-test-env_0.ensrainbow --label-set-id ens-test-env ``` ### 2. Upload `.ensrainbow` Files to R2 Storage diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx index 5262fd4c1..fbf621e7c 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/faq.mdx @@ -22,26 +22,14 @@ To stay informed about new versions, monitor the [Available Label Sets](/ensrain ## How can I create my own .ensrainbow file? 
-ENSRainbow provides two methods for creating `.ensrainbow` files:
-
-**Method 1: CSV Conversion (Recommended)**
-
-The `convert` command is the **primary method** for creating new `.ensrainbow` files from CSV data:
+You can create your own `.ensrainbow` files from CSV data using the `convert` command:
 
 1. **Prepare your data** as a CSV file with labels (1 column) or labels and labelhashes (2 columns)
 2. **Run the convert command:**
    ```bash
-   pnpm run convert --input-file your_labels.csv --output-file custom.ensrainbow --label-set-id my-dataset --label-set-version 0
+   pnpm run convert --input-file your_labels.csv --output-file custom.ensrainbow --label-set-id my-dataset
    ```
 
-**Method 2: SQL Conversion (Legacy)**
-
-For converting legacy ENS Subgraph data from PostgreSQL dumps:
-
-```bash
-pnpm run convert-sql --input-file ens_names.sql.gz --output-file custom.ensrainbow --label-set-id subgraph --label-set-version 0
-```
-
 **Note:** You can also download existing `.ensrainbow` files using the download scripts.
 
 See the [CLI Reference](/ensrainbow/contributing/cli-reference/) for detailed command usage.
 
From 7125f230a0bb35c31e6f6d3d0e460c3433fc5f20 Mon Sep 17 00:00:00 2001
From: djstrong
Date: Wed, 7 Jan 2026 13:25:11 +0100
Subject: [PATCH 24/30] feat: rename convert command for SQL dumps

---
 apps/ensrainbow/src/cli.ts                                      | 2 +-
 .../src/commands/{convert-command.ts => convert-command-sql.ts} | 0
 .../src/content/docs/ensrainbow/concepts/creating-files.mdx     | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename apps/ensrainbow/src/commands/{convert-command.ts => convert-command-sql.ts} (100%)

diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts
index cb4c18b23..35732e4e8 100644
--- a/apps/ensrainbow/src/cli.ts
+++ b/apps/ensrainbow/src/cli.ts
@@ -12,7 +12,7 @@ import {
   type LabelSetVersion,
 } from "@ensnode/ensnode-sdk";
 
-import { convertCommand } from "@/commands/convert-command";
+import { convertCommand } from "@/commands/convert-command-sql";
 import { convertCsvCommand } from "@/commands/convert-csv-command";
 // import { ingestCommand } from "@/commands/ingest-command";
 import { ingestProtobufCommand } from "@/commands/ingest-protobuf-command";
diff --git a/apps/ensrainbow/src/commands/convert-command.ts b/apps/ensrainbow/src/commands/convert-command-sql.ts
similarity index 100%
rename from apps/ensrainbow/src/commands/convert-command.ts
rename to apps/ensrainbow/src/commands/convert-command-sql.ts
diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx
index 4af35a0ee..62b036eef 100644
--- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx
+++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx
@@ -78,7 +78,7 @@ pnpm run convert \
 
 - `--output-file`: Output file path (defaults to `rainbow-records-{label-set-id}_{label-set-version}.ensrainbow`)
 - `--progress-interval`: Progress logging frequency (default: 50000 records)
-- `--existing-db-path`: Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file
+- `--existing-db-path`: Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file and determine the next label set version
 - `--silent`: Disable progress bar (useful for scripts and automated workflows)
 
 ### CSV Format Support
From f7ca2448c9b6f9e7962f33ee927ce8ebb669af12 Mon 
Sep 17 00:00:00 2001 From: djstrong Date: Wed, 7 Jan 2026 13:38:38 +0100 Subject: [PATCH 25/30] refactor: update CSV conversion documentation --- apps/ensrainbow/src/commands/convert-csv-command.test.ts | 2 +- apps/ensrainbow/src/commands/convert-csv-command.ts | 3 +-- .../src/content/docs/ensrainbow/concepts/creating-files.mdx | 4 +++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 7646f5a77..12015137f 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -513,7 +513,7 @@ describe("convert-csv-command", () => { expect(stats.isFile()).toBe(true); }); - it("should skip CSV header row if present", async () => { + it("should process all CSV rows including potential headers", async () => { const inputFile = join(tempDir, "with_header.csv"); const outputFile = join(tempDir, "output_header.ensrainbow"); const csvContent = diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 9d1f75497..05c1c5e57 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -264,7 +264,7 @@ async function initializeConversion( labelSetVersion: number, outputFile: string, ) { - logger.info("Starting conversion from CSV to protobuf format..."); + logger.info("Starting conversion from CSV to .ensrainbow format..."); logger.info(`Input file: ${options.inputFile}`); logger.info(`Output file: ${outputFile}`); logger.info(`Label set id: ${options.labelSetId}`); @@ -428,7 +428,6 @@ async function processCSVFile( let lineNumber = 0; let processedRecords = 0; let lastLoggedLine = 0; - const startTime = Date.now(); let lastLogTime = Date.now(); const fileStream = createReadStream(inputFile, { encoding: "utf8" }); diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 62b036eef..8aec77266 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -83,7 +83,7 @@ pnpm run convert \ ### CSV Format Support -The CSV converter supports two formats: +The CSV converter supports two formats and expects CSV files **without a header row**. 
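The two CSV formats documented here correspond to the row handling in `createRainbowRecord`, shown elsewhere in this patch series. The following condensed TypeScript sketch restates that logic for reference; `rowToRecord` is an illustrative name, not part of the codebase, while `labelhash` (from viem) and `labelHashToBytes` (from `@ensnode/ensnode-sdk`) are the helpers the command already uses:

```typescript
import { labelhash } from "viem";
import { type LabelHash, labelHashToBytes } from "@ensnode/ensnode-sdk";

// Condensed sketch of how one parsed CSV row becomes a rainbow record.
function rowToRecord(row: string[]): {
  labelHash: ReturnType<typeof labelHashToBytes>;
  label: string;
} {
  const label = String(row[0]);

  if (row.length === 1) {
    // Single-column CSV: compute the labelhash from the label itself.
    return { labelHash: labelHashToBytes(labelhash(label)), label };
  }

  if (row.length === 2) {
    // Two-column CSV: the provided hash is only format-checked by
    // labelHashToBytes ("0x" prefix + 64 lowercase hex digits);
    // it is NOT verified to equal labelhash(label).
    const provided = String(row[1]).trim();
    if (provided === "") throw new Error("LabelHash cannot be empty");
    const maybeLabelHash = provided.startsWith("0x") ? provided : `0x${provided}`;
    return { labelHash: labelHashToBytes(maybeLabelHash as LabelHash), label };
  }

  throw new Error(`Expected 1 or 2 columns, got ${row.length}`);
}
```

Note the asymmetry: single-column input always yields a hash computed from the label, while two-column input is trusted after a format check only.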
#### Single Column Format (Label Only) ```csv @@ -458,6 +458,8 @@ Create documentation for your custom label set including: - Checksum: `https://example.com/my-dataset_0.tgz.sha256sum` ### Usage +``` + ```bash # Using with Docker docker run -d \ From 7ab51653c96fa2b8b4437c9ef48614e0e792ab8b Mon Sep 17 00:00:00 2001 From: djstrong Date: Wed, 7 Jan 2026 14:27:13 +0100 Subject: [PATCH 26/30] test: add error handling test for label set ID mismatch in CSV conversion --- .../src/commands/convert-csv-command.test.ts | 46 +++++++++++++++++++ .../src/commands/convert-csv-command.ts | 21 +++++---- 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 12015137f..706e22166 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -338,6 +338,52 @@ describe("convert-csv-command", () => { ).rejects.toThrow(/Database is not open/); }); + it("should throw error when label set ID mismatches existing database", async () => { + const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); + const outputFile = join(tempDir, "output_mismatch.ensrainbow"); + const dataDir = join(tempDir, "db_mismatch"); + + // First, create a database with one label set ID + const initialOutputFile = join(tempDir, "initial_mismatch.ensrainbow"); + await convertCsvCommand({ + inputFile, + outputFile: initialOutputFile, + labelSetId: "test-label-set-a" as LabelSetId, + silent: true, + }); + + // Ingest the initial file to create the database + const cli = createCLI({ exitProcess: false }); + await cli.parse([ + "ingest-ensrainbow", + "--input-file", + initialOutputFile, + "--data-dir", + dataDir, + ]); + + // Verify initial database was created + const db = await ENSRainbowDB.open(dataDir); + expect(await db.validate()).toBe(true); + const labelSet = await db.getLabelSet(); + expect(labelSet.labelSetId).toBe("test-label-set-a"); + await db.close(); + + // Now try to convert with a different label set ID and the existing database path + // This should throw an error about label set ID mismatch + await expect( + convertCsvCommand({ + inputFile, + outputFile, + labelSetId: "test-label-set-b" as LabelSetId, + existingDbPath: dataDir, + silent: true, + }), + ).rejects.toThrow( + /Label set ID mismatch! Database label set id: test-label-set-a, provided label set id: test-label-set-b/, + ); + }); + it("should work through CLI with existing database path", async () => { const inputFile = join(TEST_FIXTURES_DIR, "test_labels_1col.csv"); const outputFile = join(tempDir, "cli_output_with_db.ensrainbow"); diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 05c1c5e57..77b800df8 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -206,8 +206,11 @@ async function checkLabelHashExists(db: ENSRainbowDB, labelHashBytes: Buffer): P const record = await db.getVersionedRainbowRecord(labelHashBytes); return record !== null; } catch (error) { - // If there's an error checking, assume it doesn't exist - return false; + const errorMessage = error instanceof Error ? 
error.message : String(error); + logger.error( + `Error while checking if labelhash exists in ENSRainbow database: ${errorMessage}`, + ); + throw error; } } @@ -564,7 +567,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom let existingDb: ENSRainbowDB | null = null; let dedupDb: DeduplicationDB | undefined; - let tempDedupDir: string | null = null; + let temporaryDedupDir: string | null = null; try { const { @@ -575,9 +578,9 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom existingDb = db; // Create temporary deduplication database - tempDedupDir = join(process.cwd(), "temp-dedup-" + Date.now()); - logger.info(`Creating temporary deduplication database at: ${tempDedupDir}`); - const tempDb = new ClassicLevel(tempDedupDir, { + temporaryDedupDir = join(process.cwd(), "temp-dedup-" + Date.now()); + logger.info(`Creating temporary deduplication database at: ${temporaryDedupDir}`); + const tempDb = new ClassicLevel(temporaryDedupDir, { keyEncoding: "utf8", valueEncoding: "utf8", createIfMissing: true, @@ -649,10 +652,10 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom } // Remove temporary deduplication database directory - if (tempDedupDir) { + if (temporaryDedupDir) { try { - rmSync(tempDedupDir, { recursive: true, force: true }); - logger.info(`Removed temporary deduplication database: ${tempDedupDir}`); + rmSync(temporaryDedupDir, { recursive: true, force: true }); + logger.info(`Removed temporary deduplication database: ${temporaryDedupDir}`); } catch (error) { logger.warn(`Failed to remove temporary deduplication database: ${error}`); } From 3967d4ce4713818014ce2720fe019ec6aea50985 Mon Sep 17 00:00:00 2001 From: djstrong Date: Wed, 7 Jan 2026 14:44:51 +0100 Subject: [PATCH 27/30] refactor: rename and enhance label set version retrieval function to return database connection --- .../src/commands/convert-csv-command.ts | 42 +++++++------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index 77b800df8..ac981c537 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -215,14 +215,15 @@ async function checkLabelHashExists(db: ENSRainbowDB, labelHashBytes: Buffer): P } /** - * Determine the label set version based on existing database or default to 0 + * Get the label set version and open database connection if needed + * Returns both the version and the open database connection (if opened) to avoid redundant opens */ -async function determineLabelSetVersion( +async function getLabelSetVersionAndDatabase( existingDbPath: string | undefined, labelSetId: string, -): Promise { +): Promise<{ version: number; existingDb: ENSRainbowDB | null }> { if (!existingDbPath) { - return 0; + return { version: 0, existingDb: null }; } try { @@ -239,11 +240,11 @@ async function determineLabelSetVersion( } const nextVersion = labelSet.highestLabelSetVersion + 1; - await existingDb.close(); logger.info( `Determined next label set version: ${nextVersion} (current highest: ${labelSet.highestLabelSetVersion})`, ); - return nextVersion; + // Return the open database connection instead of closing it + return { version: nextVersion, existingDb }; } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); throw new Error( @@ -266,6 +267,7 @@ async function initializeConversion( options: ConvertCsvCommandOptions, labelSetVersion: number, outputFile: string, + existingDb: ENSRainbowDB | null, ) { logger.info("Starting conversion from CSV to .ensrainbow format..."); logger.info(`Input file: ${options.inputFile}`); @@ -286,22 +288,9 @@ async function initializeConversion( logger.warn(`Could not determine file size: ${error}`); } - // Open existing database if path is provided (for filtering existing labels) - let existingDb: ENSRainbowDB | null = null; - if (options.existingDbPath) { - try { - logger.info(`Opening existing database for filtering: ${options.existingDbPath}`); - existingDb = await ENSRainbowDB.open(options.existingDbPath); - logger.info("Successfully opened existing database for label filtering"); - } catch (error) { - const errorMessage = error instanceof Error ? error.message : String(error); - logger.error( - `Failed to open existing database at ${options.existingDbPath}: ${errorMessage}`, - ); - throw new Error( - `Cannot proceed without existing database. Failed to open database at ${options.existingDbPath}: ${errorMessage}`, - ); - } + // Log if using existing database for filtering + if (existingDb) { + logger.info("Using existing database connection for label filtering"); } const { RainbowRecordType, RainbowRecordCollectionType } = createRainbowProtobufRoot(); @@ -546,8 +535,9 @@ async function processCSVFile( * Main CSV conversion command with true streaming using fast-csv */ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Promise { - // Determine label set version from existing database or default to 0 - const labelSetVersion = await determineLabelSetVersion( + // Get label set version from existing database or default to 0 + // This also opens the database if needed, and we'll reuse that connection + const { version: labelSetVersion, existingDb: openedDb } = await getLabelSetVersionAndDatabase( options.existingDbPath, options.labelSetId, ); @@ -565,7 +555,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom startTime: new Date(), }; - let existingDb: ENSRainbowDB | null = null; + let existingDb: ENSRainbowDB | null = openedDb; let dedupDb: DeduplicationDB | undefined; let temporaryDedupDir: string | null = null; @@ -574,7 +564,7 @@ export async function convertCsvCommand(options: ConvertCsvCommandOptions): Prom RainbowRecordType, outputStream, existingDb: db, - } = await initializeConversion(options, labelSetVersion, outputFile); + } = await initializeConversion(options, labelSetVersion, outputFile, existingDb); existingDb = db; // Create temporary deduplication database From 0a555540411202e6a056f26aaafccc6ff537f280 Mon Sep 17 00:00:00 2001 From: djstrong Date: Thu, 8 Jan 2026 00:42:32 +0100 Subject: [PATCH 28/30] refactor: remove label set version requirement from CLI commands and update related tests --- apps/ensrainbow/src/cli.test.ts | 161 ++++++++++++++---- apps/ensrainbow/src/cli.ts | 19 +-- .../src/commands/convert-csv-command.test.ts | 2 +- .../src/commands/convert-csv-command.ts | 8 +- .../ensrainbow/concepts/creating-files.mdx | 4 +- 5 files changed, 137 insertions(+), 57 deletions(-) diff --git a/apps/ensrainbow/src/cli.test.ts b/apps/ensrainbow/src/cli.test.ts index 596b35663..dedf1b88a 100644 --- a/apps/ensrainbow/src/cli.test.ts +++ b/apps/ensrainbow/src/cli.test.ts @@ -107,7 +107,6 @@ describe("CLI", () => { const ensrainbowFile = join(TEST_FIXTURES_DIR, 
"test_ens_names_0.ensrainbow"); const ensrainbowOutputFile = join(tempDir, "test_ens_names_0.ensrainbow"); const labelSetId = "test-ens-names"; // Needed for convert - const labelSetVersion = 0; // Needed for convert expect(() => cli.parse([ @@ -117,7 +116,7 @@ describe("CLI", () => { "--output-file", ensrainbowOutputFile, ]), - ).toThrow(/Missing required arguments: label-set-id, label-set-version/); + ).toThrow(/Missing required argument: label-set-id/); // Successful convert with args const ingestCli = createCLI({ exitProcess: false }); @@ -129,8 +128,6 @@ describe("CLI", () => { ensrainbowOutputFile, "--label-set-id", labelSetId, - "--label-set-version", - labelSetVersion.toString(), ]); //command: pnpm convert-sql --input-file test/fixtures/test_ens_names.sql.gz --output-file test/fixtures/test_ens_names_0.ensrainbow --label-set-id test-ens-names --label-set-version 0 //verify that the file is created @@ -159,7 +156,6 @@ describe("CLI", () => { const sqlInputFile = join(TEST_FIXTURES_DIR, "ens_test_env_names.sql.gz"); const ensrainbowOutputFile = join(tempDir, "ens_test_env_0.ensrainbow"); const labelSetId = "ens-test-env"; // Needed for convert - const labelSetVersion = 0; // Needed for convert expect(() => cli.parse([ @@ -169,7 +165,7 @@ describe("CLI", () => { "--output-file", ensrainbowOutputFile, ]), - ).toThrow(/Missing required arguments: label-set-id, label-set-version/); + ).toThrow(/Missing required argument: label-set-id/); // Successful convert with args const ingestCli = createCLI({ exitProcess: false }); @@ -181,8 +177,6 @@ describe("CLI", () => { ensrainbowOutputFile, "--label-set-id", labelSetId, - "--label-set-version", - labelSetVersion.toString(), ]); //command: pnpm convert-sql --input-file test_ens_names.sql.gz --output-file test_ens_names_0.ensrainbow --label-set-id test-ens-names --label-set-version 0 //verify that the file is created @@ -207,7 +201,6 @@ describe("CLI", () => { const sqlInputFile = join(TEST_FIXTURES_DIR, "test_ens_names.sql.gz"); const ensrainbowOutputFile = join(tempDir, "test_ens_names_1.ensrainbow"); const labelSetId = "test-ens-names"; // Needed for convert - const labelSetVersion = 1; // Needed for convert expect(() => cli.parse([ @@ -217,20 +210,47 @@ describe("CLI", () => { "--output-file", ensrainbowOutputFile, ]), - ).toThrow(/Missing required arguments: label-set-id, label-set-version/); + ).toThrow(/Missing required argument: label-set-id/); const ingestCli2 = createCLI({ exitProcess: false }); - // Successful convert with args + // Successful convert with args (convert-sql always creates version 0) + // To test version 1, we need to use convert command with existing database + // But for this test, we'll create version 0 and then manually test the ingestion failure + const csvInputFile = join(TEST_FIXTURES_DIR, "test_labels_2col.csv"); + const tempDbDirForV1 = join(tempDir, "temp-db-for-v1"); + const version0FileForV1 = join(tempDir, "test_ens_names_0_for_v1.ensrainbow"); + + // Create version 0 file await ingestCli2.parse([ - "convert-sql", + "convert", "--input-file", - sqlInputFile, + csvInputFile, + "--output-file", + version0FileForV1, + "--label-set-id", + labelSetId, + ]); + + // Ingest version 0 to create database + await ingestCli2.parse([ + "ingest-ensrainbow", + "--input-file", + version0FileForV1, + "--data-dir", + tempDbDirForV1, + ]); + + // Create version 1 file using existing database + await ingestCli2.parse([ + "convert", + "--input-file", + csvInputFile, "--output-file", ensrainbowOutputFile, "--label-set-id", 
labelSetId, - "--label-set-version", - labelSetVersion.toString(), + "--existing-db-path", + tempDbDirForV1, ]); //verify it is created await expect(stat(ensrainbowOutputFile)).resolves.toBeDefined(); @@ -254,38 +274,99 @@ describe("CLI", () => { }); it("should ingest first file successfully but reject second file with label set version not being 1 higher than the current highest label set version", async () => { - // First, ingest a valid file with label set version 0 - const firstInputFile = join(TEST_FIXTURES_DIR, "test_ens_names_0.ensrainbow"); + // First, we'll create a version 0 file and then a version 2 file const secondInputFile = join(tempDir, "test_ens_names_2.ensrainbow"); // Create an ensrainbow file with label set version 2 - const sqlInputFile = join(TEST_FIXTURES_DIR, "test_ens_names.sql.gz"); + // To create version 2, we need to create version 0, ingest it, create version 1, ingest it, then create version 2 + const csvInputFile = join(TEST_FIXTURES_DIR, "test_labels_2col.csv"); const labelSetId = "test-ens-names"; - const labelSetVersion = 2; // Higher than 1 - // Successful convert with label set version 2 + // Create temporary directory for building up versions sequentially + const tempDbDir = join(tempDir, "temp-db"); + const version0File = join(tempDir, "test_ens_names_0_temp.ensrainbow"); + const version1File = join(tempDir, "test_ens_names_1_temp.ensrainbow"); + const convertCli = createCLI({ exitProcess: false }); + + // Step 1: Create version 0 file await convertCli.parse([ - "convert-sql", + "convert", "--input-file", - sqlInputFile, + csvInputFile, + "--output-file", + version0File, + "--label-set-id", + labelSetId, + ]); + + // Step 2: Ingest version 0 to create database (database now has version 0) + await convertCli.parse([ + "ingest-ensrainbow", + "--input-file", + version0File, + "--data-dir", + tempDbDir, + ]); + + // Step 3: Create version 1 file using existing database (will be version 1) + await convertCli.parse([ + "convert", + "--input-file", + csvInputFile, + "--output-file", + version1File, + "--label-set-id", + labelSetId, + "--existing-db-path", + tempDbDir, + ]); + + // Step 4: Ingest version 1 into the same database (database now has versions 0 and 1, highest is 1) + await convertCli.parse([ + "ingest-ensrainbow", + "--input-file", + version1File, + "--data-dir", + tempDbDir, + ]); + + // Step 5: Create version 2 file using existing database (will be version 2, since highest is 1) + await convertCli.parse([ + "convert", + "--input-file", + csvInputFile, "--output-file", secondInputFile, "--label-set-id", labelSetId, - "--label-set-version", - labelSetVersion.toString(), + "--existing-db-path", + tempDbDir, ]); // Verify the file with label set version 2 was created await expect(stat(secondInputFile)).resolves.toBeDefined(); + // Create a completely separate version 0 file for the final test + // Use a fresh CLI instance and ensure no existing-db-path is used + const finalTestCli = createCLI({ exitProcess: false }); + const finalTestVersion0File = join(tempDir, "final_test_v0.ensrainbow"); + await finalTestCli.parse([ + "convert", + "--input-file", + csvInputFile, + "--output-file", + finalTestVersion0File, + "--label-set-id", + labelSetId, + ]); + // First ingest succeeds with label set version 0 const ingestCli = createCLI({ exitProcess: false }); await ingestCli.parse([ "ingest-ensrainbow", "--input-file", - firstInputFile, + finalTestVersion0File, "--data-dir", testDataDir, ]); @@ -311,35 +392,45 @@ describe("CLI", () => { const thirdInputFile 
= join(tempDir, "different_label_set_id_1.ensrainbow"); // Create an ensrainbow file with different label set id - const sqlInputFile = join(TEST_FIXTURES_DIR, "test_ens_names.sql.gz"); + const csvInputFile = join(TEST_FIXTURES_DIR, "test_labels_2col.csv"); const labelSetId = "different-label-set-id"; // Different from test-ens-names - const labelSetVersion = 0; + + // Create temporary directory for version 0 database + const tempDbDir0 = join(tempDir, "temp-db-different-v0"); // Create second file with different label set id and label set version 0 const convertCli = createCLI({ exitProcess: false }); await convertCli.parse([ - "convert-sql", + "convert", "--input-file", - sqlInputFile, + csvInputFile, "--output-file", secondInputFile, "--label-set-id", labelSetId, - "--label-set-version", - labelSetVersion.toString(), ]); // Create third file with different label set id and label set version 1 + // First, ingest version 0 to create database await convertCli.parse([ - "convert-sql", + "ingest-ensrainbow", "--input-file", - sqlInputFile, + secondInputFile, + "--data-dir", + tempDbDir0, + ]); + + // Then create version 1 using existing database + await convertCli.parse([ + "convert", + "--input-file", + csvInputFile, "--output-file", thirdInputFile, "--label-set-id", labelSetId, - "--label-set-version", - "1", + "--existing-db-path", + tempDbDir0, ]); // Verify the file with different label set id was created diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 35732e4e8..a3fb392bb 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -5,12 +5,7 @@ import type { ArgumentsCamelCase, Argv } from "yargs"; import { hideBin } from "yargs/helpers"; import yargs from "yargs/yargs"; -import { - buildLabelSetId, - buildLabelSetVersion, - type LabelSetId, - type LabelSetVersion, -} from "@ensnode/ensnode-sdk"; +import { buildLabelSetId, type LabelSetId } from "@ensnode/ensnode-sdk"; import { convertCommand } from "@/commands/convert-command-sql"; import { convertCsvCommand } from "@/commands/convert-csv-command"; @@ -59,7 +54,6 @@ interface ConvertArgs { "input-file": string; "output-file"?: string; "label-set-id": LabelSetId; - "label-set-version": LabelSetVersion; } interface ConvertCsvArgs { @@ -256,12 +250,6 @@ export function createCLI(options: CLIOptions = {}) { demandOption: true, }) .coerce("label-set-id", buildLabelSetId) - .option("label-set-version", { - type: "number", - description: "Label set version for the generated ensrainbow file", - demandOption: true, - }) - .coerce("label-set-version", buildLabelSetVersion) .option("output-file", { type: "string", description: "Path to where the resulting ensrainbow file will be output", @@ -269,13 +257,12 @@ export function createCLI(options: CLIOptions = {}) { }, async (argv: ArgumentsCamelCase) => { const outputFile = - argv["output-file"] ?? - join(process.cwd(), `${argv["label-set-id"]}_${argv["label-set-version"]}.ensrainbow`); + argv["output-file"] ?? 
join(process.cwd(), `${argv["label-set-id"]}_.ensrainbow`); await convertCommand({ inputFile: argv["input-file"], outputFile, labelSetId: argv["label-set-id"], - labelSetVersion: argv["label-set-version"], + labelSetVersion: 0, }); }, ) diff --git a/apps/ensrainbow/src/commands/convert-csv-command.test.ts b/apps/ensrainbow/src/commands/convert-csv-command.test.ts index 706e22166..42b1cdd47 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.test.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.test.ts @@ -107,7 +107,7 @@ describe("convert-csv-command", () => { outputFile, labelSetId: "test-csv-invalid" as LabelSetId, }), - ).rejects.toThrow(/Failed on line 1: Invalid labelHash/); + ).rejects.toThrow(/Failed on line 1: Expected 1 or 2 col/); }); it("should handle CSV with special characters, emojis, unicode, and quoted fields", async () => { diff --git a/apps/ensrainbow/src/commands/convert-csv-command.ts b/apps/ensrainbow/src/commands/convert-csv-command.ts index ac981c537..edeabfbba 100644 --- a/apps/ensrainbow/src/commands/convert-csv-command.ts +++ b/apps/ensrainbow/src/commands/convert-csv-command.ts @@ -316,7 +316,7 @@ function createRainbowRecord(row: string[]): RainbowRecord { labelHash: labelHashBytes, label: label, }; - } else { + } else if (row.length === 2) { // Two columns: validate labelhash format and use provided hash // Trim whitespace from hash (metadata), but preserve label as-is const providedHash = String(row[1]).trim(); @@ -327,13 +327,15 @@ function createRainbowRecord(row: string[]): RainbowRecord { try { const labelHash = labelHashToBytes(maybeLabelHash as LabelHash); // performs labelhash format validation return { - labelHash: labelHash, - label: label, + labelHash, + label, }; } catch (error) { const errorMessage = error instanceof Error ? 
error.message : String(error); throw new Error(`Invalid labelHash: ${errorMessage}`); } + } else { + throw new Error(`Expected 1 or 2 columns, but found ${row.length} columns`); } } diff --git a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx index 8aec77266..335a112b5 100644 --- a/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx +++ b/docs/ensnode.io/src/content/docs/ensrainbow/concepts/creating-files.mdx @@ -76,7 +76,7 @@ pnpm run convert \ ### Optional Parameters -- `--output-file`: Output file path (defaults to `rainbow-records-{label-set-id}_{label-set-version}.ensrainbow`) +- `--output-file`: Output file path (defaults to `{label-set-id}_{label-set-version}.ensrainbow`) - `--progress-interval`: Progress logging frequency (default: 50000 records) - `--existing-db-path`: Path to existing ENSRainbow database to filter out existing labels from the generated ensrainbow file and determine the next label set version - `--silent`: Disable progress bar (useful for scripts and automated workflows) @@ -196,7 +196,7 @@ pnpm run convert-sql \ ### Optional Parameters -- `--output-file`: Output file path (defaults to `rainbow-records-{label-set-id}_{label-set-version}.ensrainbow`) +- `--output-file`: Output file path (defaults to `{label-set-id}_{label-set-version}.ensrainbow`) ### Example: Converting Legacy ENS Subgraph Data From 653d200418f5b21cee4ae063874601d262152cca Mon Sep 17 00:00:00 2001 From: "kwrobel.eth" Date: Thu, 8 Jan 2026 00:55:01 +0100 Subject: [PATCH 29/30] Update apps/ensrainbow/src/cli.ts Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- apps/ensrainbow/src/cli.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index a3fb392bb..32ab22b7b 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -257,7 +257,8 @@ export function createCLI(options: CLIOptions = {}) { }, async (argv: ArgumentsCamelCase) => { const outputFile = - argv["output-file"] ?? join(process.cwd(), `${argv["label-set-id"]}_.ensrainbow`); + argv["output-file"] ?? + join(process.cwd(), `${argv["label-set-id"]}_0.ensrainbow`); await convertCommand({ inputFile: argv["input-file"], outputFile, From af0c7f0842f3a0e9070dbd68ddb1ebed431f624b Mon Sep 17 00:00:00 2001 From: djstrong Date: Thu, 8 Jan 2026 00:56:44 +0100 Subject: [PATCH 30/30] lint --- apps/ensrainbow/src/cli.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/ensrainbow/src/cli.ts b/apps/ensrainbow/src/cli.ts index 32ab22b7b..445f41cbe 100644 --- a/apps/ensrainbow/src/cli.ts +++ b/apps/ensrainbow/src/cli.ts @@ -257,8 +257,7 @@ export function createCLI(options: CLIOptions = {}) { }, async (argv: ArgumentsCamelCase) => { const outputFile = - argv["output-file"] ?? - join(process.cwd(), `${argv["label-set-id"]}_0.ensrainbow`); + argv["output-file"] ?? join(process.cwd(), `${argv["label-set-id"]}_0.ensrainbow`); await convertCommand({ inputFile: argv["input-file"], outputFile,
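
Appendix (illustration only). The convert-csv change above branches on the CSV column count when building each rainbow record: one column means the labelhash is computed from the label, two columns means a provided hash is trimmed, validated, and decoded (the label itself is preserved as-is), and any other width is rejected with "Expected 1 or 2 columns". Below is a minimal TypeScript sketch of that branching. It assumes viem's labelhash and hexToBytes as stand-ins for the app's internal labelHashToBytes helper; createRainbowRecordSketch and its regex check are hypothetical simplifications, not the real implementation.

import { hexToBytes, labelhash } from "viem";

interface RainbowRecordSketch {
  labelHash: Uint8Array;
  label: string;
}

// Hypothetical stand-in for createRainbowRecord in convert-csv-command.ts.
function createRainbowRecordSketch(row: string[]): RainbowRecordSketch {
  // The label column is preserved exactly as given (no trimming).
  const label = String(row[0]);

  if (row.length === 1) {
    // One column: compute the labelhash from the label itself.
    return { labelHash: hexToBytes(labelhash(label)), label };
  }

  if (row.length === 2) {
    // Two columns: trim the provided hash (metadata), then validate and decode it.
    // This regex is a simplified stand-in for the real labelHashToBytes validation.
    const providedHash = String(row[1]).trim();
    if (!/^0x[0-9a-fA-F]{64}$/.test(providedHash)) {
      throw new Error(`Invalid labelHash: ${providedHash}`);
    }
    return { labelHash: hexToBytes(providedHash.toLowerCase() as `0x${string}`), label };
  }

  throw new Error(`Expected 1 or 2 columns, but found ${row.length} columns`);
}

In the real command the hash validation lives in labelHashToBytes, so a malformed hash surfaces as "Invalid labelHash: ..." while a row of the wrong width fails the column-count check first, which is why the updated test expectation matches /Expected 1 or 2 col/ instead of /Invalid labelHash/.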