Filtering Events

Techniques for filtering and selecting events.

Spatial Filtering

By Bounding Box

use spatial_narrative::core::GeoBounds;

// Bounds of the area to keep
let nyc = GeoBounds::new(40.4, -74.3, 41.0, -73.7);

// Option 1: let the narrative apply the spatial filter
let nyc_narrative = narrative.filter_spatial(&nyc);

// Option 2: filter the events yourself, keeping those the bounds contain
let nyc_events = events
    .iter()
    .filter(|event| nyc.contains(&event.location))
    .collect::<Vec<_>>();

By Radius

use spatial_narrative::analysis::haversine_distance;
use spatial_narrative::core::Location;

let center = Location::new(40.7128, -74.0060);
let radius_m = 5000.0; // 5km

// Keep events whose distance from `center` is at most `radius_m`
let nearby = events
    .iter()
    .filter(|event| {
        let distance = haversine_distance(&center, &event.location);
        distance <= radius_m
    })
    .collect::<Vec<_>>();

By Multiple Regions

// One bounding box per region of interest
let regions = vec![
    GeoBounds::new(40.4, -74.3, 41.0, -73.7),   // NYC
    GeoBounds::new(33.7, -118.7, 34.4, -117.9), // LA
    GeoBounds::new(41.6, -88.0, 42.1, -87.5),   // Chicago
];

// An event is kept as soon as any one region contains its location
let in_any_region = events
    .iter()
    .filter(|event| {
        regions
            .iter()
            .any(|bounds| bounds.contains(&event.location))
    })
    .collect::<Vec<_>>();

Temporal Filtering

By Time Range

// `Timestamp` is imported alongside `TimeRange` because `Timestamp::parse`
// is called below.
use spatial_narrative::core::{TimeRange, Timestamp};

// Specific range.
// `unwrap()` is acceptable for these hard-coded literals; handle the failure
// case explicitly when the strings come from user input.
let range = TimeRange::new(
    Timestamp::parse("2024-01-01T00:00:00Z").unwrap(),
    Timestamp::parse("2024-01-31T23:59:59Z").unwrap(),
);
// Let the narrative apply the temporal filter
let january = narrative.filter_temporal(&range);

// Convenience constructors: build each month's range once, outside the
// filter closure, instead of rebuilding them for every event
let q1_months = [
    TimeRange::month(2024, 1),
    TimeRange::month(2024, 2),
    TimeRange::month(2024, 3),
];

// Lazy iterator over the events whose timestamp falls in any of the three months
let q1 = events.iter()
    .filter(|e| q1_months.iter().any(|month| month.contains(&e.timestamp)));

Before/After

// Events strictly earlier than `cutoff` land in `before`;
// events at or later than `cutoff` land in `after`
let cutoff = Timestamp::parse("2024-06-01T00:00:00Z").unwrap();

let before = events
    .iter()
    .filter(|event| event.timestamp < cutoff)
    .collect::<Vec<_>>();

let after = events
    .iter()
    .filter(|event| event.timestamp >= cutoff)
    .collect::<Vec<_>>();

By Day of Week

// Keep only events that fall on a Sunday (0) or a Saturday (6)
let weekends = events
    .iter()
    .filter(|event| {
        let day = event.timestamp.weekday();
        let is_sunday = day == 0;
        let is_saturday = day == 6;
        is_sunday || is_saturday
    })
    .collect::<Vec<_>>();

By Time of Day

// Business hours: hour 9 inclusive up to hour 17 exclusive (9 AM to 5 PM)
let business_hours = events
    .iter()
    .filter(|event| (9..17).contains(&event.timestamp.hour()))
    .collect::<Vec<_>>();

Tag Filtering

Single Tag

// Keep the events that carry the "important" tag
let important = events
    .iter()
    .filter(|event| event.has_tag("important"))
    .collect::<Vec<_>>();

Any of Tags

let priority_tags = ["urgent", "important", "critical"];

// Keep an event when it carries at least one of the priority tags
let priority = events
    .iter()
    .filter(|event| priority_tags.iter().any(|tag| event.has_tag(tag)))
    .collect::<Vec<_>>();

All of Tags

let required_tags = ["verified", "published"];

// Keep an event only when it carries every required tag
let complete = events
    .iter()
    .filter(|event| required_tags.iter().all(|tag| event.has_tag(tag)))
    .collect::<Vec<_>>();

Excluding Tags

// Drop anything tagged "spam" or "duplicate"
let not_spam = events
    .iter()
    .filter(|event| !(event.has_tag("spam") || event.has_tag("duplicate")))
    .collect::<Vec<_>>();

Text Filtering

Contains Keyword

// Lowercase the event text first so the keyword match ignores case
let mentions_storm = events
    .iter()
    .filter(|event| {
        let lowered = event.text.to_lowercase();
        lowered.contains("storm")
    })
    .collect::<Vec<_>>();

Regex Matching

use regex::Regex;

// Compile the pattern once, outside the filter: three digits, a dash,
// three digits, a dash, four digits
let phone_pattern = Regex::new(r"\d{3}-\d{3}-\d{4}").unwrap();

// Keep events whose text contains a match anywhere
let with_phone = events
    .iter()
    .filter(|event| phone_pattern.is_match(&event.text))
    .collect::<Vec<_>>();

Minimum Length

// Keep events whose text has a `len()` of at least 50.
// NOTE(review): if `text` is a `String`, `len()` counts UTF-8 bytes, not
// characters — confirm that is the intended measure.
let substantive = events
    .iter()
    .filter(|event| event.text.len() >= 50)
    .collect::<Vec<_>>();

Combined Filtering

Chained Filters

// Each `.filter` narrows the result further; an event must pass all four.
// Assumes `nyc_bounds` and `january` are already in scope and provide `contains`.
let filtered = events
    .iter()
    .filter(|event| nyc_bounds.contains(&event.location)) // spatial
    .filter(|event| january.contains(&event.timestamp)) // temporal
    .filter(|event| event.has_tag("verified")) // required tag
    .filter(|event| !event.has_tag("duplicate")) // excluded tag
    .collect::<Vec<_>>();

With Predicate Function

/// Returns `true` only when `event` passes every relevance check:
/// its location is contained in `NYC_BOUNDS`, its timestamp is contained in
/// `STUDY_PERIOD`, its text has a `len()` of at least 10, and it carries the
/// "verified" tag.
fn is_relevant(event: &Event) -> bool {
    // Evaluate each criterion up front, then require all of them.
    let checks = [
        NYC_BOUNDS.contains(&event.location),
        STUDY_PERIOD.contains(&event.timestamp),
        event.text.len() >= 10,
        event.has_tag("verified"),
    ];

    checks.iter().all(|&passed| passed)
}

// Run every event through the `is_relevant` predicate and keep the matches
let relevant = events
    .iter()
    .filter(|event| is_relevant(event))
    .collect::<Vec<_>>();

Using Indexes for Filtering

Spatial Index

use spatial_narrative::index::SpatialIndex;

// Build the index from a clone of `events`; the closure returns each event's location
let index = SpatialIndex::from_iter(events.clone(), |e| &e.location);

// Fast bounding box query
// NOTE(review): `bounds` is not defined in this snippet — it must already be in scope
let in_region = index.query_bounds(&bounds);

// Fast radius query
// NOTE(review): `lat`, `lon` and `radius` are not defined in this snippet — they must already be in scope
let nearby = index.query_radius_meters(lat, lon, radius);

Temporal Index

use spatial_narrative::index::TemporalIndex;

// Build the index from a clone of `events`; the closure returns each event's timestamp
let index = TemporalIndex::from_iter(events.clone(), |e| &e.timestamp);

// Fast range query
// NOTE(review): `time_range` is not defined in this snippet — it must already be in scope
let in_period = index.query_range(&time_range);

// Ordered access
let chronological = index.chronological();

Combined Index

use spatial_narrative::index::SpatiotemporalIndex;

// Build the combined index from a clone of `events`; the first closure returns
// each event's location and the second returns its timestamp
let index = SpatiotemporalIndex::from_iter(
    events.clone(), 
    |e| &e.location, 
    |e| &e.timestamp
);

// Query both dimensions efficiently
// NOTE(review): `bounds` and `time_range` are not defined in this snippet — they must already be in scope
let filtered = index.query(&bounds, &time_range);

Sampling

Random Sample

use rand::seq::SliceRandom;

// Random number generator shared by the sampling calls
let mut rng = rand::thread_rng();

// Ask for 100 randomly chosen events and gather the picks into a vector
let sample = events
    .choose_multiple(&mut rng, 100)
    .collect::<Vec<_>>();

Systematic Sample

// Every 10th event, starting with the first (indices 0, 10, 20, ...).
// `step_by` replaces the manual enumerate / filter-on-index / map sequence.
let systematic: Vec<_> = events.iter()
    .step_by(10)
    .collect();

Stratified Sample

// Sample from each region.
// Relies on `regions`, `rng` and the `SliceRandom` import being in scope.
let mut samples = Vec::new();
for region in &regions {
    let in_region: Vec<_> = events.iter()
        .filter(|e| region.contains(&e.location))
        .collect();
    // `choose_multiple` yields references into the temporary `in_region`
    // vector. `.copied()` turns them back into the plain event references
    // taken from `events`, so `samples` does not borrow a vector that is
    // dropped at the end of each iteration.
    samples.extend(in_region.choose_multiple(&mut rng, 10).copied());
}