Files
Ecobuddy/Databases/generate_bulk_facilities.py
boris 78508a7cbd erm
Signed-off-by: boris <boris@borishub.co.uk>
2025-04-20 16:49:23 +01:00

568 lines
25 KiB
Python

import sqlite3
import random
import csv
import os
from datetime import datetime
# Open the SQLite database. The path is relative, so the script must be run
# from the directory that contains 'ecobuddy.sqlite'.
conn = sqlite3.connect('ecobuddy.sqlite')
cursor = conn.cursor()

# Highest facility id currently stored. MAX() yields NULL (None) on an empty
# table, hence the fallback to 0.
max_facility_id = cursor.execute("SELECT MAX(id) FROM ecoFacilities").fetchone()[0] or 0

# Ids of all users; each generated facility gets one of these as contributor.
user_ids = [user_id for (user_id,) in cursor.execute("SELECT id FROM ecoUser")]

# Lookup table: category id -> category name.
categories = dict(cursor.execute("SELECT id, name FROM ecoCategories"))
# UK cities and towns used as cluster centres for the generated facilities.
# Each entry is a 5-tuple:
#   (town/city, county, latitude, longitude, postcode area)
# Coordinates are approximate.
uk_locations = [
    ("London", "Greater London", 51.5074, -0.1278, "EC"),
    ("Birmingham", "West Midlands", 52.4862, -1.8904, "B"),
    ("Manchester", "Greater Manchester", 53.4808, -2.2426, "M"),
    ("Glasgow", "Glasgow", 55.8642, -4.2518, "G"),
    ("Liverpool", "Merseyside", 53.4084, -2.9916, "L"),
    ("Bristol", "Bristol", 51.4545, -2.5879, "BS"),
    ("Edinburgh", "Edinburgh", 55.9533, -3.1883, "EH"),
    ("Leeds", "West Yorkshire", 53.8008, -1.5491, "LS"),
    ("Sheffield", "South Yorkshire", 53.3811, -1.4701, "S"),
    ("Newcastle upon Tyne", "Tyne and Wear", 54.9783, -1.6178, "NE"),
    ("Nottingham", "Nottinghamshire", 52.9548, -1.1581, "NG"),
    ("Cardiff", "Cardiff", 51.4816, -3.1791, "CF"),
    ("Belfast", "Belfast", 54.5973, -5.9301, "BT"),
    ("Brighton", "East Sussex", 50.8225, -0.1372, "BN"),
    ("Leicester", "Leicestershire", 52.6369, -1.1398, "LE"),
    ("Aberdeen", "Aberdeen", 57.1497, -2.0943, "AB"),
    ("Portsmouth", "Hampshire", 50.8198, -1.0880, "PO"),
    ("York", "North Yorkshire", 53.9599, -1.0873, "YO"),
    ("Swansea", "Swansea", 51.6214, -3.9436, "SA"),
    ("Oxford", "Oxfordshire", 51.7520, -1.2577, "OX"),
    ("Cambridge", "Cambridgeshire", 52.2053, 0.1218, "CB"),
    ("Exeter", "Devon", 50.7184, -3.5339, "EX"),
    ("Bath", "Somerset", 51.3751, -2.3617, "BA"),
    ("Reading", "Berkshire", 51.4543, -0.9781, "RG"),
    ("Preston", "Lancashire", 53.7632, -2.7031, "PR"),
    ("Coventry", "West Midlands", 52.4068, -1.5197, "CV"),
    ("Hull", "East Yorkshire", 53.7676, -0.3274, "HU"),
    ("Stoke-on-Trent", "Staffordshire", 53.0027, -2.1794, "ST"),
    ("Wolverhampton", "West Midlands", 52.5870, -2.1288, "WV"),
    ("Plymouth", "Devon", 50.3755, -4.1427, "PL"),
    ("Derby", "Derbyshire", 52.9225, -1.4746, "DE"),
    ("Sunderland", "Tyne and Wear", 54.9069, -1.3830, "SR"),
    ("Southampton", "Hampshire", 50.9097, -1.4044, "SO"),
    ("Norwich", "Norfolk", 52.6309, 1.2974, "NR"),
    ("Bournemouth", "Dorset", 50.7192, -1.8808, "BH"),
    ("Middlesbrough", "North Yorkshire", 54.5742, -1.2350, "TS"),
    ("Blackpool", "Lancashire", 53.8175, -3.0357, "FY"),
    ("Bolton", "Greater Manchester", 53.5785, -2.4299, "BL"),
    ("Ipswich", "Suffolk", 52.0567, 1.1482, "IP"),
    ("Telford", "Shropshire", 52.6784, -2.4453, "TF"),
    ("Dundee", "Dundee", 56.4620, -2.9707, "DD"),
    ("Peterborough", "Cambridgeshire", 52.5695, -0.2405, "PE"),
    ("Huddersfield", "West Yorkshire", 53.6458, -1.7850, "HD"),
    ("Luton", "Bedfordshire", 51.8787, -0.4200, "LU"),
    ("Warrington", "Cheshire", 53.3900, -2.5970, "WA"),
    ("Southend-on-Sea", "Essex", 51.5459, 0.7077, "SS"),
    ("Swindon", "Wiltshire", 51.5557, -1.7797, "SN"),
    ("Slough", "Berkshire", 51.5105, -0.5950, "SL"),
    ("Watford", "Hertfordshire", 51.6565, -0.3903, "WD"),
    ("Carlisle", "Cumbria", 54.8952, -2.9335, "CA"),
]
# Word pools for building street names: one prefix and one suffix are combined
# as "<prefix> <suffix>". Order is significant for seeded runs, and "Mill"
# appears twice in the prefixes (so it is drawn twice as often).
street_prefixes = [
    "High", "Main", "Church", "Park", "Mill", "Station", "London", "Victoria", "Queen", "King",
    "North", "South", "East", "West", "New", "Old", "Castle", "Bridge", "Green", "Market",
    "School", "Manor", "Abbey", "Priory", "Cathedral", "University", "College", "Hospital", "Railway", "Canal",
    "River", "Forest", "Wood", "Hill", "Mount", "Valley", "Meadow", "Field", "Farm", "Garden",
    "Orchard", "Vineyard", "Grange", "Lodge", "Court", "Hall", "House", "Cottage", "Barn", "Mill",
    "Windmill", "Watermill", "Forge", "Quarry", "Mine", "Pit", "Well", "Spring", "Brook", "Stream",
    "Lake", "Pond", "Pool", "Reservoir", "Bay", "Cove", "Beach", "Cliff", "Rock", "Stone",
    "Granite", "Marble", "Slate", "Clay", "Sand", "Gravel", "Chalk", "Flint", "Coal", "Iron",
    "Steel", "Copper", "Silver", "Gold", "Tin", "Lead", "Zinc", "Brass", "Bronze", "Pewter",
    "Nickel", "Cobalt", "Chromium", "Titanium", "Aluminium", "Silicon", "Carbon", "Oxygen", "Hydrogen", "Nitrogen",
    "Helium", "Neon", "Argon", "Krypton", "Xenon", "Radon",
]
street_suffixes = [
    "Street", "Road", "Lane", "Avenue", "Drive", "Boulevard", "Way", "Place", "Square", "Court",
    "Terrace", "Close", "Crescent", "Gardens", "Grove", "Mews", "Alley", "Walk", "Path", "Trail",
    "Hill", "Rise", "View", "Heights", "Park", "Green", "Meadow", "Field", "Common", "Heath",
    "Moor", "Down", "Fell", "Pike", "Tor", "Crag", "Cliff", "Ridge", "Edge", "Top",
    "Bottom", "Side", "End", "Corner", "Junction", "Cross", "Gate", "Bridge", "Ford", "Ferry",
    "Wharf", "Quay", "Dock", "Harbor", "Port", "Bay", "Cove", "Beach", "Shore", "Bank",
    "Strand", "Esplanade", "Parade", "Promenade", "Embankment", "Causeway", "Viaduct", "Tunnel", "Passage", "Arcade",
    "Gallery", "Mall", "Market", "Bazaar", "Fair", "Exchange", "Mart", "Emporium", "Center", "Circle",
    "Oval", "Triangle", "Pentagon", "Hexagon", "Octagon", "Circus", "Ring", "Loop", "Bend", "Curve",
    "Turn", "Twist", "Spiral", "Coil", "Helix", "Maze", "Labyrinth",
]
# Description templates keyed by category id. One entry is picked at random
# for every generated facility of that category. The trailing comments give
# the category each id is meant to represent; the authoritative names are
# read from the ecoCategories table at runtime.
category_descriptions = {
    1: [  # Recycling Bins
        "Public recycling point for paper, glass, plastic, and metal",
        "Community recycling station with separate bins for different materials",
        "Recycling center with facilities for household waste separation",
        "Public access recycling bins for common household recyclables",
        "Multi-material recycling point with clear instructions for proper sorting",
    ],
    2: [  # e-Scooters
        "Dockless e-scooter rental station with multiple vehicles available",
        "E-scooter parking and charging zone for public use",
        "Designated e-scooter pickup and drop-off point",
        "E-scooter sharing station with app-based rental system",
        "Electric scooter hub with maintenance and charging facilities",
    ],
    3: [  # Bike Share Stations
        "Public bicycle sharing station with multiple bikes available",
        "Bike rental hub with secure docking stations",
        "Community bike share point with regular and electric bicycles",
        "Cycle hire station with self-service rental system",
        "Bike sharing facility with maintenance and repair services",
    ],
    4: [  # Public EV Charging Stations
        "Electric vehicle charging point with multiple connectors",
        "Fast-charging station for electric vehicles",
        "Public EV charging facility with covered waiting area",
        "Multi-vehicle electric charging hub with different power options",
        "EV charging station with renewable energy source",
    ],
    5: [  # Battery Recycling Points
        "Dedicated collection point for used batteries of all sizes",
        "Battery recycling bin with separate compartments for different types",
        "Safe disposal facility for household and small electronics batteries",
        "Battery collection point with educational information about recycling",
        "Secure battery recycling station to prevent environmental contamination",
    ],
    6: [  # Community Compost Bins
        "Neighborhood composting facility for food and garden waste",
        "Community compost bins with educational signage",
        "Public composting station with separate sections for different stages",
        "Shared compost facility managed by local volunteers",
        "Urban composting hub turning food waste into valuable soil",
    ],
    7: [  # Solar-Powered Benches
        "Solar bench with USB charging ports and WiFi connectivity",
        "Public seating with integrated solar panels and device charging",
        "Smart bench powered by solar energy with digital information display",
        "Solar-powered rest area with phone charging capabilities",
        "Eco-friendly bench with solar panels and LED lighting",
    ],
    8: [  # Green Roofs
        "Building with extensive green roof system visible from public areas",
        "Accessible green roof garden with native plant species",
        "Public building showcasing sustainable rooftop vegetation",
        "Green roof installation with educational tours available",
        "Biodiverse roof garden with insect habitats and rainwater collection",
    ],
    9: [  # Public Water Refill Stations
        "Free water refill station to reduce plastic bottle usage",
        "Public drinking fountain with bottle filling capability",
        "Water refill point with filtered water options",
        "Accessible water station encouraging reusable bottles",
        "Community water dispenser with usage counter display",
    ],
    10: [  # Waste Oil Collection Points
        "Cooking oil recycling point for residential use",
        "Used oil collection facility with secure containers",
        "Waste oil drop-off point for conversion to biodiesel",
        "Community oil recycling station with spill prevention measures",
        "Cooking oil collection facility with educational information",
    ],
    11: [  # Book Swap Stations
        "Community book exchange point with weatherproof shelving",
        "Public book sharing library in repurposed phone box",
        "Free book swap station encouraging reading and reuse",
        "Neighborhood book exchange with rotating collection",
        "Little free library with take-one-leave-one system",
    ],
    12: [  # Pollinator Gardens
        "Public garden designed to support bees and butterflies",
        "Pollinator-friendly planting area with native flowering species",
        "Community garden dedicated to supporting local insect populations",
        "Bee-friendly garden with educational signage about pollinators",
        "Urban wildflower meadow supporting biodiversity",
    ],
    13: [  # E-Waste Collection Bins
        "Secure collection point for electronic waste and small appliances",
        "E-waste recycling bin for phones, computers, and small electronics",
        "Electronic waste drop-off point with data security assurance",
        "Community e-waste collection facility with regular collection schedule",
        "Dedicated bin for responsible disposal of electronic items",
    ],
    14: [  # Clothing Donation Bins
        "Textile recycling point for clothes and household fabrics",
        "Clothing donation bin supporting local charities",
        "Secure collection point for reusable clothing and textiles",
        "Community clothing recycling bin with regular collection",
        "Textile donation point preventing landfill waste",
    ],
    15: [  # Community Tool Libraries
        "Tool lending library for community use and sharing",
        "Shared equipment facility reducing need for individual ownership",
        "Community resource center for borrowing tools and equipment",
        "Tool sharing hub with membership system and workshops",
        "Public tool library with wide range of equipment available",
    ],
    16: [  # Urban Farms
        "Community-run urban farm providing local produce",
        "City farming project with volunteer opportunities",
        "Urban agriculture site with educational programs",
        "Local food growing initiative in repurposed urban space",
        "Community garden with vegetable plots and fruit trees",
    ],
    17: [  # Rainwater Harvesting Systems
        "Public demonstration of rainwater collection for irrigation",
        "Rainwater harvesting system with educational displays",
        "Community rainwater collection facility for shared gardens",
        "Visible rainwater storage and filtration system",
        "Urban water conservation project with storage tanks",
    ],
}
# Status comment templates keyed by category id. Zero to three of these are
# attached to each inserted facility. The trailing comments give the category
# each id is meant to represent; the authoritative names are read from the
# ecoCategories table at runtime.
status_comments = {
    1: [  # Recycling Bins
        "Recently emptied and cleaned",
        "Some bins are nearly full",
        "All bins in good condition",
        "Paper bin is currently full",
        "New signage installed to improve sorting",
    ],
    2: [  # e-Scooters
        "All scooters fully charged",
        "Three scooters currently available",
        "Maintenance scheduled for next week",
        "New scooters added to this location",
        "High usage area, scooters frequently unavailable",
    ],
    3: [  # Bike Share Stations
        "All docking stations operational",
        "Five bikes currently available",
        "Some bikes need maintenance",
        "New electric bikes added",
        "Popular station with high turnover",
    ],
    4: [  # Public EV Charging Stations
        "All charging points operational",
        "Fast charger currently under repair",
        "Peak usage during business hours",
        "New charging point added last month",
        "Payment system recently upgraded",
    ],
    5: [  # Battery Recycling Points
        "Collection bin recently emptied",
        "Secure container in good condition",
        "New signage explaining battery types",
        "High usage from local businesses",
        "Additional capacity added",
    ],
    6: [  # Community Compost Bins
        "Compost ready for collection",
        "Needs more brown material",
        "Recently turned and aerated",
        "New bins added to increase capacity",
        "Volunteer day scheduled for maintenance",
    ],
    7: [  # Solar-Powered Benches
        "All charging ports working",
        "Solar panels recently cleaned",
        "WiFi currently unavailable",
        "LED lights need replacement",
        "High usage during lunch hours",
    ],
    8: [  # Green Roofs
        "Plants thriving after recent rain",
        "Maintenance scheduled next month",
        "New species added to increase biodiversity",
        "Irrigation system working well",
        "Open for public tours on weekends",
    ],
    9: [  # Public Water Refill Stations
        "Water quality tested weekly",
        "Fountain cleaned daily",
        "Bottle filler counter shows high usage",
        "New filter installed recently",
        "Popular during summer months",
    ],
    10: [  # Waste Oil Collection Points
        "Container recently emptied",
        "Secure lid in good condition",
        "New funnel system installed",
        "Collection schedule posted",
        "Area kept clean and tidy",
    ],
    11: [  # Book Swap Stations
        "Good selection currently available",
        "Children's books needed",
        "Recently reorganized by volunteers",
        "Weatherproof cover working well",
        "High turnover of popular titles",
    ],
    12: [  # Pollinator Gardens
        "Plants in full bloom",
        "Many bees and butterflies observed",
        "New native species planted",
        "Volunteer day for maintenance scheduled",
        "Educational tours available",
    ],
    13: [  # E-Waste Collection Bins
        "Bin recently emptied",
        "Secure deposit system working",
        "Collection schedule posted",
        "New items accepted now include small appliances",
        "Data destruction guaranteed",
    ],
    14: [  # Clothing Donation Bins
        "Bin recently emptied",
        "Clean and well-maintained",
        "High quality donations appreciated",
        "Winter clothing especially needed",
        "Please bag items before donating",
    ],
    15: [  # Community Tool Libraries
        "New inventory system implemented",
        "Popular tools often unavailable on weekends",
        "Tool maintenance workshop scheduled",
        "New donations recently added to collection",
        "Extended hours during summer",
    ],
    16: [  # Urban Farms
        "Seasonal produce currently available",
        "Volunteer opportunities posted",
        "Educational workshops on weekends",
        "New growing area being developed",
        "Composting system recently expanded",
    ],
    17: [  # Rainwater Harvesting Systems
        "System working efficiently after recent rainfall",
        "Water quality monitoring in place",
        "Educational tours available by appointment",
        "System capacity recently expanded",
        "Used for irrigation of nearby community garden",
    ],
}
# Generate a realistic UK postcode based on area code
def generate_postcode(area_code):
    """Return a random, plausibly formatted UK postcode for ``area_code``.

    The result has the shape ``<area><district> <sector><unit>``, for example
    ``M1 1AA`` or ``EC42 7XY``:

    * district - integer from 1 to 99
    * sector   - single digit from 1 to 9
    * unit     - two upper-case letters

    The same layout is used for one- and two-letter areas, so no branching on
    the length of ``area_code`` is needed. Districts with a trailing letter
    (such as ``SW1A``) are not generated.

    Args:
        area_code: Postcode area letters, e.g. ``"M"`` or ``"EC"``.

    Returns:
        The postcode string. It is only format-plausible; it is not checked
        against any list of postcodes that actually exist.
    """
    # The two unit letters of a real postcode never use C, I, K, M, O or V.
    unit_letters = 'ABDEFGHJLNPQRSTUWXYZ'

    district = random.randint(1, 99)
    sector = random.randint(1, 9)
    unit = ''.join(random.choices(unit_letters, k=2))
    return f"{area_code}{district} {sector}{unit}"
# Generate a realistic street name
def generate_street_name():
    """Return a random street name of the form ``"<prefix> <suffix>"``.

    The prefix is drawn from ``street_prefixes`` first, then the suffix from
    ``street_suffixes``; both draws are uniform over the respective list.
    """
    first_word = random.choice(street_prefixes)
    second_word = random.choice(street_suffixes)
    return " ".join((first_word, second_word))
# Generate a realistic house number
def generate_house_number():
    """Return a random house identifier as a string.

    About 80% of the time this is a plain number from 1 to 200. Otherwise it
    is one of four equally likely variants: a number with a letter suffix
    (e.g. ``42A``), ``Unit n`` (1-20), ``Flat n`` (1-50) or ``Suite n`` (1-10).
    """
    if random.random() >= 0.8:
        # All four variants are drawn before one is picked, so the amount of
        # randomness consumed does not depend on which variant wins.
        lettered = f"{random.randint(1, 200)}{random.choice('ABCDEFG')}"  # e.g., 42A
        unit = f"Unit {random.randint(1, 20)}"
        flat = f"Flat {random.randint(1, 50)}"
        suite = f"Suite {random.randint(1, 10)}"
        return random.choice([lettered, unit, flat, suite])

    return str(random.randint(1, 200))
# Add small random variation to coordinates to avoid facilities at exact same location
def vary_coordinates(lat, lng):
    """Return ``(lat, lng)`` shifted by a small random offset.

    Latitude moves by at most 0.004 degrees and longitude by at most 0.006
    degrees in either direction (intended as a spread of up to roughly 500
    metres), so facilities sharing a base location do not stack on one point.

    Args:
        lat: Base latitude in decimal degrees.
        lng: Base longitude in decimal degrees.

    Returns:
        A ``(latitude, longitude)`` tuple. The latitude offset is drawn first.
    """
    jittered_lat = lat + random.uniform(-0.004, 0.004)
    jittered_lng = lng + random.uniform(-0.006, 0.006)
    return jittered_lat, jittered_lng
# Generate facility title based on category and location
def generate_title(category_name, location_name, street_name):
    """Build a facility title by filling in one of six random layouts.

    Args:
        category_name: Name of the facility category.
        location_name: Town or city name.
        street_name: Street the facility is on.

    Returns:
        The title string. Uniqueness is not guaranteed here; callers that
        need unique titles must de-duplicate themselves.
    """
    layouts = (
        "{location} {category}",
        "{category} at {street}",
        "{street} {category}",
        "Community {category} {location}",
        "{location} Central {category}",
        "{location} {street} {category}",
    )
    chosen_layout = random.choice(layouts)
    return chosen_layout.format(
        category=category_name,
        location=location_name,
        street=street_name,
    )
# Create a log file to track progress.
# Both output files are written as UTF-8 explicitly: titles are built from
# category names read from the database, and the platform default encoding
# may not be able to represent every character in them.
# log_file and csv_file stay open for the rest of the script and are closed
# explicitly at the very end of the run.
log_file = open("facility_generation_log.txt", "w", encoding="utf-8")
log_file.write(f"Starting facility generation at {datetime.now()}\n")
log_file.write("Target: 1000 new facilities\n\n")

# Create a CSV file to store all generated facilities for reference.
# newline='' lets the csv module control line endings itself.
csv_file = open("generated_facilities.csv", "w", newline='', encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["ID", "Title", "Category", "Description", "Address", "Postcode", "Latitude", "Longitude", "Contributor"])

# Rows are accumulated in memory first and inserted in batches afterwards.
facilities_to_insert = []
status_comments_to_insert = []

# Titles already present in the database, so generated titles can be kept
# unique against existing rows as well as against each other.
cursor.execute("SELECT title FROM ecoFacilities")
existing_titles = {row[0] for row in cursor.fetchall()}
# Generate 1000 facilities
num_facilities = 1000
facilities_created = 0
log_file.write("Generating facilities...\n")

# Only categories that have description templates can be generated: looking
# up any other id in category_descriptions would raise KeyError mid-run.
eligible_category_ids = [cid for cid in categories if cid in category_descriptions]
skipped_category_ids = [cid for cid in categories if cid not in category_descriptions]
if skipped_category_ids:
    log_file.write(f"Skipping categories without description templates: {skipped_category_ids}\n")

# Fail early with a readable message instead of an IndexError from
# random.choice() deep inside the loop.
if not eligible_category_ids:
    raise SystemExit("No category in ecoCategories has description templates; nothing to generate.")
if not user_ids:
    raise SystemExit("ecoUser is empty; at least one user is needed as contributor.")

while facilities_created < num_facilities:
    # Select a random location to act as the centre of a cluster
    location_name, county, base_lat, base_lng, postcode_area = random.choice(uk_locations)

    # Generate 5-25 facilities per location to create clusters. The count is
    # capped by the remaining quota, so the total never overshoots the target
    # and no explicit break is needed.
    facilities_per_location = min(random.randint(5, 25), num_facilities - facilities_created)

    for _ in range(facilities_per_location):
        # Select a random category
        category_id = random.choice(eligible_category_ids)
        category_name = categories[category_id]

        # Generate address components
        street_name = generate_street_name()
        house_number = generate_house_number()
        lat, lng = vary_coordinates(base_lat, base_lng)
        postcode = generate_postcode(postcode_area)

        # Generate title and make it unique by appending " 2", " 3", ... to
        # the base title until it collides with nothing seen so far
        title_base = generate_title(category_name, location_name, street_name)
        title = title_base
        suffix = 2
        while title in existing_titles:
            title = f"{title_base} {suffix}"
            suffix += 1
        existing_titles.add(title)

        # Select description and contributor
        description = random.choice(category_descriptions[category_id])
        contributor_id = random.choice(user_ids)

        # Add to batch for insertion. The field order must match the column
        # list of the INSERT statement (note: lng comes before lat).
        facilities_to_insert.append((
            title,
            category_id,
            description,
            house_number,
            street_name,
            county,
            location_name,
            postcode,
            lng,
            lat,
            contributor_id,
        ))

        # Log progress periodically
        facilities_created += 1
        if facilities_created % 100 == 0:
            log_message = f"Generated {facilities_created} facilities so far..."
            print(log_message)
            log_file.write(log_message + "\n")
# Insert facilities in batches for better performance
log_file.write("\nInserting facilities into database...\n")
print("Inserting facilities into database...")
batch_size = 50
insert_facility_sql = """
INSERT INTO ecoFacilities
(title, category, description, houseNumber, streetName, county, town, postcode, lng, lat, contributor)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
facility_batch_total = (len(facilities_to_insert) - 1) // batch_size + 1

for batch_start in range(0, len(facilities_to_insert), batch_size):
    batch = facilities_to_insert[batch_start:batch_start + batch_size]
    cursor.executemany(insert_facility_sql, batch)

    # Work out the ids of the rows just inserted by counting back from the
    # last rowid.
    # NOTE(review): this assumes the batch received consecutive rowids and
    # that ecoFacilities.id is the rowid - confirm against the schema.
    newest_id = cursor.execute("SELECT last_insert_rowid()").fetchone()[0]
    first_id_in_batch = newest_id - len(batch) + 1

    for facility_id, facility in enumerate(batch, start=first_id_in_batch):
        (title, category_id, description, house_number, street_name,
         county, town, postcode, lng, lat, contributor_id) = facility

        # Write to CSV for reference (latitude before longitude, as in the
        # CSV header)
        csv_writer.writerow([
            facility_id,
            title,
            categories[category_id],
            description,
            f"{house_number} {street_name}, {town}, {county}",
            postcode,
            lat,
            lng,
            contributor_id,
        ])

        # Decide how many status comments to add (0-3, weighted 30/40/20/10)
        num_comments = random.choices([0, 1, 2, 3], weights=[30, 40, 20, 10])[0]
        if not num_comments:
            continue

        # Comments for this category; falls back to the recycling bin
        # comments when the category has none of its own
        relevant_comments = status_comments.get(category_id, status_comments[1])

        # Pick distinct comments and queue them for insertion
        selected_comments = random.sample(relevant_comments, min(num_comments, len(relevant_comments)))
        status_comments_to_insert.extend((facility_id, comment) for comment in selected_comments)

    # Commit after each batch
    conn.commit()
    log_message = f"Inserted batch {batch_start // batch_size + 1}/{facility_batch_total}"
    print(log_message)
    log_file.write(log_message + "\n")
# Insert status comments in batches (same batch size as the facility inserts)
insert_status_sql = """
INSERT INTO ecoFacilityStatus (facilityId, statusComment)
VALUES (?, ?)
"""
if status_comments_to_insert:
    log_file.write("\nInserting status comments...\n")
    print("Inserting status comments...")
    comment_batch_total = (len(status_comments_to_insert) - 1) // batch_size + 1
    batch_offsets = range(0, len(status_comments_to_insert), batch_size)
    for batch_number, offset in enumerate(batch_offsets, start=1):
        cursor.executemany(insert_status_sql, status_comments_to_insert[offset:offset + batch_size])
        # Commit after each batch
        conn.commit()
        log_message = f"Inserted comment batch {batch_number}/{comment_batch_total}"
        print(log_message)
        log_file.write(log_message + "\n")
# Get final counts (these include rows that existed before this run)
cursor.execute("SELECT COUNT(*) FROM ecoFacilities")
total_facilities = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM ecoFacilityStatus")
total_comments = cursor.fetchone()[0]

# Log completion to both the console and the log file
completion_message = f"\nGeneration complete at {datetime.now()}"
print(completion_message)
log_file.write(completion_message + "\n")
summary = (
    f"Total facilities in database: {total_facilities}\n"
    f"Total status comments in database: {total_comments}\n"
    "Generated facilities saved to generated_facilities.csv for reference"
)
print(summary)
log_file.write(summary)

# Close the output files and the database connection
log_file.close()
csv_file.close()
conn.commit()
conn.close()

# Report the number of facilities this run actually generated rather than a
# hard-coded figure, so the message stays correct if the target changes.
print(f"\nSuccessfully added {len(facilities_to_insert)} new ecological facilities to the database.")
print("A detailed log and CSV export have been created for reference.")