# data-import > Import Excel/CSV data into the database with validation - Author: Justinsato - Repository: Justinsato/brickston-ai - Version: 20260203201127 - Stars: 0 - Forks: 0 - Last Updated: 2026-02-06 - Source: https://github.com/Justinsato/brickston-ai - Web: https://mule.run/skillshub/@@Justinsato/brickston-ai~data-import:20260203201127 --- --- name: data-import description: Import Excel/CSV data into the database with validation --- # Data Import Skill ## Overview Standardized patterns for importing Excel and CSV files into the brickston-ai PostgreSQL database. ## File Locations - **Import scripts**: `scripts/import_*.py` - **Data files**: `data/` or uploaded via API - **Target tables**: Various in PostgreSQL ## Creating an Import Script ### Step 1: Script Template Location: `scripts/import_.py` ```python #!/usr/bin/env python3 """ Import script for data. Source: Target: """ import asyncio import pandas as pd from pathlib import Path from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession from sqlalchemy.orm import sessionmaker from sqlalchemy import text import os DATABASE_URL = os.environ.get("DATABASE_URL") async def main(): # Load data file_path = Path("data/source_file.xlsx") df = pd.read_excel(file_path, sheet_name="Sheet1") # Clean and validate df = clean_data(df) # Import to database await import_to_db(df) print(f"Imported {len(df)} records") def clean_data(df: pd.DataFrame) -> pd.DataFrame: """Clean and validate data before import.""" # Rename columns to match DB schema df = df.rename(columns={ "Source Column": "db_column", "Another Column": "another_db_column", }) # Remove nulls in required fields df = df.dropna(subset=["db_column"]) # Type conversions df["amount"] = pd.to_numeric(df["amount"], errors="coerce") df["date_field"] = pd.to_datetime(df["date_field"]) return df async def import_to_db(df: pd.DataFrame): """Import DataFrame to database.""" engine = create_async_engine(DATABASE_URL) async_session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False) async with async_session() as session: for _, row in df.iterrows(): await session.execute( text(""" INSERT INTO table_name (col1, col2, col3) VALUES (:col1, :col2, :col3) ON CONFLICT (unique_key) DO UPDATE SET col2 = EXCLUDED.col2, col3 = EXCLUDED.col3 """), { "col1": row["col1"], "col2": row["col2"], "col3": row["col3"], } ) await session.commit() if __name__ == "__main__": asyncio.run(main()) ``` ## Common Patterns ### Excel with Multiple Sheets ```python xlsx = pd.ExcelFile("data/workbook.xlsx") for sheet_name in xlsx.sheet_names: df = pd.read_excel(xlsx, sheet_name=sheet_name) await process_sheet(sheet_name, df) ``` ### Upsert Pattern ```sql INSERT INTO table_name (id, field1, field2) VALUES (:id, :field1, :field2) ON CONFLICT (id) DO UPDATE SET field1 = EXCLUDED.field1, field2 = EXCLUDED.field2, updated_at = NOW() ``` ### Batch Insert for Performance ```python async def batch_insert(session, records, batch_size=1000): for i in range(0, len(records), batch_size): batch = records[i:i + batch_size] await session.execute( text("INSERT INTO table ..."), batch ) await session.commit() print(f"Inserted batch {i // batch_size + 1}") ``` ### Validation with Logging ```python import logging logger = logging.getLogger(__name__) def validate_row(row, row_num): errors = [] if pd.isna(row["required_field"]): errors.append(f"Row {row_num}: missing required_field") if row["amount"] < 0: errors.append(f"Row {row_num}: negative amount") return errors # During import all_errors = [] for i, row in df.iterrows(): all_errors.extend(validate_row(row, i)) if all_errors: for err in all_errors[:10]: # Show first 10 logger.warning(err) raise ValueError(f"{len(all_errors)} validation errors") ``` ## Running Imports ```bash # Set database URL export DATABASE_URL="postgresql+asyncpg://user:pass@host/db" # Run import python scripts/import_data_type.py ``` ## Checklist - [ ] Script created in `scripts/import_*.py` - [ ] Column mapping matches database schema - [ ] Null handling for required fields - [ ] Type conversions (dates, numbers) - [ ] Upsert logic for idempotency - [ ] Validation with clear error messages - [ ] Tested with sample data