A comprehensive system for generating realistic PII (Personally Identifiable Information) training data for Named Entity Recognition (NER) models, specifically designed for Latin American countries with advanced augmentation capabilities.
| Entity Type | Description | Examples |
|---|---|---|
| CUSTOMER_NAME | Full names with country conventions | Juan Pérez González |
| ID_NUMBER | Country-specific ID formats | RUT, CURP, CPF, Cédula |
| ADDRESS | Street addresses and cities | Av. Providencia 123 |
| PHONE_NUMBER | Country-specific phone formats | +56 9 1234 5678 |
| EMAIL | Email addresses with local domains | juan.perez@empresa.cl |
| AMOUNT | Monetary amounts with currency | $150.000 CLP |
| SEQ_NUMBER | Sequential reference numbers | REF-12345, FOLIO-67890 |
| DATE | Various date formats | 15/08/2024, 15 de agosto de 2024 |
| DIRECTION | Direction/orientation info | hacia el centro, rumbo norte |
| LOCATION | Specific location references | Mall Plaza Norte, Aeropuerto SCL |
| POSTAL_CODE | Postal/zip codes | 7500000, 01000 |
| REGION | Region/state information | Región Metropolitana |
DATA_GENERATION/
├── main_pipeline.py              # Main orchestration pipeline
├── requirements.txt              # Python dependencies
├── README.md                     # This file
│
├── generators/                   # PII generation modules
│   ├── enhanced_pii_generator.py # Enhanced PII types and variety
│   └── negative_examples_generator.py
│
├── augmentation/                 # NLP augmentation modules
│   └── nlp_augmentation.py       # NLTK-based augmentation
│
├── Spacy/                        # spaCy-specific modules
│   ├── data_generation_noisy.py  # Core spaCy data generator
│   └── config.cfg                # spaCy training configuration
│
├── Transformers/                 # Transformer-specific modules
│   ├── transformer_data_generator.py
│   └── train_transformer_ner.py
│
├── data_spacy/                   # spaCy training data
│   ├── train/                    # Training data files
│   ├── dev/                      # Development data files
│   ├── test/                     # Test data files
│   └── README.md                 # spaCy data format guide
│
├── data_transformers/            # Transformers training data
│   ├── train/                    # Training data files
│   ├── dev/                      # Development data files
│   ├── test/                     # Test data files
│   └── README.md                 # Transformers data format guide
│
├── database/                     # Database management
│   ├── database_manager.py       # SQLite database operations
│   └── schema.sql                # Database schema
│
├── corruption/                   # Data corruption modules
│   └── extreme_corruption.py     # Advanced corruption techniques
│
├── dataset_composer/             # Dataset composition
│   └── mixed_dataset_generator.py
│
├── tests/                        # Test suites
│   └── test_enhanced_pii.py      # Enhanced PII tests
│
├── configs/                      # Configuration files
│   └── optimized_config.cfg      # Optimized spaCy config
│
├── docs/                         # Documentation
│   ├── ARCHITECTURE.md           # System architecture
│   ├── API_REFERENCE.md          # API documentation
│   └── USAGE_EXAMPLES.md         # Usage examples
│
└── examples/                     # Example scripts
    └── complete_workflow.py      # Complete workflow example
# Clone the repository
git clone https://github.com/andresveraf/DATA_GENERATION.git
cd DATA_GENERATION
# Install dependencies
pip install -r requirements.txt
# Install spaCy language models
python -m spacy download es_core_news_sm
python -m spacy download pt_core_news_sm
# Download NLTK data (will be downloaded automatically on first use)
python -c "import nltk; nltk.download('punkt'); nltk.download('wordnet'); nltk.download('averaged_perceptron_tagger')"
# Install in development mode
pip install -e .
# Install additional development dependencies
pip install pytest pytest-cov black flake8
# Generate spaCy format data
python main_pipeline.py --mode mixed-dataset --size 10000 --export-formats spacy --output-dir data_spacy/
# Generate Transformers format data
python main_pipeline.py --mode mixed-dataset --size 10000 --export-formats transformers --output-dir data_transformers/
# Generate both formats simultaneously
python main_pipeline.py --mode mixed-dataset --size 10000 --export-formats spacy,transformers --composition balanced
# Generate country-specific data
python main_pipeline.py --mode mixed-dataset --size 5000 --export-formats json,transformers --composition chile_focused
from generators.enhanced_pii_generator import EnhancedPIIGenerator
from augmentation.nlp_augmentation import create_augmentator
# Create enhanced PII generator
generator = EnhancedPIIGenerator()
# Generate all PII types for Chile
pii_data = generator.generate_all_pii_types('chile')
print(pii_data)
# Create NLP augmentator
augmentator = create_augmentator(language="es", synonym_rate=0.3, noise_rate=0.1)
# Augment text while preserving entities
text = "El cliente Juan Pérez con RUT 12.345.678-9 reside en Santiago."
entities = [(11, 21, "CUSTOMER_NAME"), (26, 39, "ID_NUMBER"), (50, 58, "ADDRESS")]
augmented_text, updated_entities = augmentator.augment_text(text, entities)
graph TB
A[Main Pipeline] --> B[Enhanced PII Generator]
A --> C[NLP Augmentator]
A --> D[Database Manager]
B --> E[Country-Specific Data]
B --> F[12+ PII Types]
C --> G[Synonym Replacement]
C --> H[Noise Injection]
C --> I[OCR Errors]
D --> J[SQLite Database]
D --> K[Statistics Tracking]
A --> L[Export Formats]
L --> M[spaCy Binary]
L --> N[JSON/CoNLL]
L --> O[CSV/Excel]
M --> P[data_spacy/]
N --> Q[data_transformers/]
O --> R[Analysis Files]
from generators.enhanced_pii_generator import EnhancedPIIGenerator
generator = EnhancedPIIGenerator()
# Generate new PII types
direction = generator.generate_direction('chile') # "hacia el centro"
location = generator.generate_location('chile') # "Mall Plaza Norte"
postal_code = generator.generate_postal_code('chile') # "7500000"
region = generator.generate_region('chile')  # "Región Metropolitana"
# Enhanced existing types
phone = generator.generate_enhanced_phone('chile') # "+56 9 1234 5678"
sequence = generator.generate_enhanced_sequence() # "REF-12345"
date = generator.generate_enhanced_date('chile') # "15/08/2024"
from augmentation.nlp_augmentation import NLPAugmentator, AugmentationConfig
# Configure augmentation
config = AugmentationConfig(
synonym_replacement_rate=0.3, # 30% of non-PII words
noise_injection_rate=0.1, # 10% character noise
language="es"
)
augmentator = NLPAugmentator(config)
# Apply augmentation while preserving PII
text = "El cliente vive en Santiago y trabaja en una empresa."
entities = [(50, 58, "ADDRESS")] # "Santiago"
augmented_text, updated_entities = augmentator.augment_text(text, entities)
# Result: "El usuario reside en Santiago y labora en una compañía."
from main_pipeline import EnhancedPIIDataPipeline
# Initialize pipeline
pipeline = EnhancedPIIDataPipeline()
# Generate mixed dataset
dataset = pipeline.generate_mixed_dataset(
size=10000,
composition_name='balanced',
export_formats=['json', 'spacy', 'conll']
)
# Generate with extreme corruption
corrupted_dataset = pipeline.generate_extreme_corruption_dataset(
base_documents=dataset['train_documents'],
corruption_levels=['medium', 'extreme'],
samples_per_level=1000
)
from generators.enhanced_pii_generator import validate_pii_variety
# Validate PII variety for quality assurance
variety_report = validate_pii_variety('chile', samples=1000)
for pii_type, metrics in variety_report.items():
print(f"{pii_type}: {metrics['variety_percentage']:.1f}% variety")
print(f" Sufficient variety: {metrics['sufficient_variety']}")
# Dataset composition options
python main_pipeline.py --mode mixed-dataset --composition balanced # Equal distribution
python main_pipeline.py --mode mixed-dataset --composition pii-heavy # More PII entities
python main_pipeline.py --mode mixed-dataset --composition minimal # Fewer entities
# Export format options
--export-formats json # JSON format only
--export-formats spacy # spaCy binary format (.spacy)
--export-formats transformers # CONLL/BIO format (.conll)
--export-formats json,spacy,transformers # Multiple formats
--export-formats csv # Analysis format
# Country-specific generation
--country chile # Chile only
--country mexico # Mexico only
--country all # All countries
# Augmentation options
--augmentation-enabled # Enable NLP augmentation
--augmentation-rate 0.3 # 30% augmentation rate
--augmentation-types synonyms,noise # Specific augmentation types
train.spacy, dev.spacy

# Loading spaCy format
import spacy
from spacy.tokens import DocBin
nlp = spacy.blank("es")
doc_bin = DocBin().from_disk("data_spacy/train.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))
train.conll, dev.conll

Juan B-CUSTOMER_NAME
Pérez I-CUSTOMER_NAME
vive O
en O
Santiago B-ADDRESS
# Generate spaCy training data
python main_pipeline.py --mode mixed-dataset --size 20000 --export-formats spacy --output-dir data_spacy/
# Train spaCy model
python -m spacy train configs/optimized_config.cfg --output ./models --paths.train ./data_spacy/train.spacy --paths.dev ./data_spacy/dev.spacy
# Generate transformer training data
python main_pipeline.py --mode mixed-dataset --size 15000 --export-formats json,conll --output-dir data_transformers/
# Train transformer model
cd Transformers/
python train_transformer_ner.py --data_dir ../data_transformers/ --model_name dccuchile/bert-base-spanish-wwm-uncased
# Run comprehensive test suite
python -m pytest tests/ -v
# Run specific test modules
python tests/test_enhanced_pii.py
# Run with coverage
python -m pytest tests/ --cov=generators --cov=augmentation --cov-report=html
# Test enhanced PII generation
python -c "
from generators.enhanced_pii_generator import EnhancedPIIGenerator
generator = EnhancedPIIGenerator()
pii_data = generator.generate_all_pii_types('chile')
for pii_type, value in pii_data.items():
print(f'{pii_type}: {value}')
"
# Test NLP augmentation
python -c "
from augmentation.nlp_augmentation import create_augmentator
augmentator = create_augmentator()
text = 'El cliente Juan Pérez vive en Santiago.'
entities = [(11, 21, 'CUSTOMER_NAME'), (31, 39, 'ADDRESS')]
augmented, _ = augmentator.augment_text(text, entities)
print(f'Original: {text}')
print(f'Augmented: {augmented}')
"
from main_pipeline import EnhancedPIIDataPipeline
# Create pipeline
pipeline = EnhancedPIIDataPipeline()
# Generate balanced dataset
dataset = pipeline.generate_mixed_dataset(
size=5000,
composition_name='balanced',
output_dir='output/',
export_formats=['json', 'spacy']
)
print(f"Generated {len(dataset['train_documents'])} training documents")
print(f"Generated {len(dataset['dev_documents'])} development documents")
from generators.enhanced_pii_generator import EnhancedPIIGenerator
from augmentation.nlp_augmentation import create_augmentator
# Generate Chilean PII data
generator = EnhancedPIIGenerator()
pii_data = generator.generate_all_pii_types('chile')
# Create training sentence
text = f"Cliente {pii_data['CUSTOMER_NAME']} con {pii_data['ID_NUMBER']} reside en {pii_data['ADDRESS']}."
# Define entities
entities = [
(8, 8 + len(pii_data['CUSTOMER_NAME']), "CUSTOMER_NAME"),
(13 + len(pii_data['CUSTOMER_NAME']), 13 + len(pii_data['CUSTOMER_NAME']) + len(pii_data['ID_NUMBER']), "ID_NUMBER"),
(25 + len(pii_data['CUSTOMER_NAME']) + len(pii_data['ID_NUMBER']), 25 + len(pii_data['CUSTOMER_NAME']) + len(pii_data['ID_NUMBER']) + len(pii_data['ADDRESS']), "ADDRESS")
]
# Apply augmentation
augmentator = create_augmentator(language="es", synonym_rate=0.3)
augmented_text, updated_entities = augmentator.augment_text(text, entities)
print(f"Original: {text}")
print(f"Augmented: {augmented_text}")
from generators.enhanced_pii_generator import validate_pii_variety
from main_pipeline import EnhancedPIIDataPipeline
# Generate large dataset
pipeline = EnhancedPIIDataPipeline()
dataset = pipeline.generate_mixed_dataset(size=10000, composition_name='balanced')
# Validate quality
variety_report = validate_pii_variety('chile', samples=1000)
# Print quality metrics
print("Quality Validation Report:")
print("-" * 40)
for pii_type, metrics in variety_report.items():
status = "✓" if metrics['sufficient_variety'] else "✗"
print(f"{status} {pii_type}: {metrics['variety_percentage']:.1f}% variety")
NLTK Data Missing
python -c "import nltk; nltk.download('punkt'); nltk.download('wordnet')"
spaCy Model Missing
python -m spacy download es_core_news_sm
python -m spacy download pt_core_news_sm
Import Errors
# Ensure project root is in Python path
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
Database Permissions
# Ensure write permissions for database directory
chmod 755 database/
--batch-size parameter
--gc-enabled
--workers parameter for multi-processing
--db-batch-size for bulk inserts

# Fork and clone the repository
git clone https://github.com/yourusername/DATA_GENERATION.git
cd DATA_GENERATION
# Create virtual environment
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
# Install in development mode
pip install -e .
pip install -r requirements.txt
# Install development dependencies
pip install pytest pytest-cov black flake8 mypy
# Run tests
python -m pytest tests/
# Format code
black generators/ augmentation/ tests/
# Check style
flake8 generators/ augmentation/ tests/
# Type checking
mypy generators/ augmentation/
git checkout -b feature/new-pii-type
python -m pytest tests/
black .

This project is licensed under the MIT License - see the LICENSE file for details.
For questions, issues, or contributions:
Made with ❤️ for the NLP and AI community