mirror of
https://github.com/JackHopkins/factorio-learning-environment.git
synced 2025-09-06 13:23:58 +00:00

* add changes * refactor: clean up unused Docker scripts and redundant files Remove unused and redundant files from fle/cluster/docker/: - main.py: Redundant Python script that duplicates run_local.sh functionality with hardcoded ports and basic container management. Not used in workflow. - install-docker.sh: AWS EC2-specific setup script with hardcoded instance (ec2-18-133-239-115) and outdated Amazon Linux commands. Current workflow uses docker-compose instead. - probe.sh: Simple port checking script using lsof. Redundant since docker-compose handles health checks and container status monitoring. - setup_docker_repo.sh: AWS ECR-specific setup with hardcoded AWS account (216370203482). Contains mostly commented code and unused ECR repository configuration. Not used in current workflow. - requirements.txt: Redundant Python dependency file. Docker is a system dependency, not a Python package. The Python docker SDK is already included in pyproject.toml dependencies. Kept essential files: Dockerfile, build scripts, run scripts, config/, mods/, and README.md which are actively used in the Docker workflow. * refactor: move lib/ and tools/ to mods/, clean up fle/env/utils/ (all deleted files were unused), prep for entrypoints/ refactor * readd * last undo * refactor: move evaluator.py to algorithms/mcts, move experiment entrypoints to entrypoints/, update imports accordingly * version info * incorrect gitignore * style: replace all relative imports in fle.agents with absolute imports * simplify gitignore * commit * gitignore * redo * update * No code is importing the fle.agents.data package/module * exclude data/prompts from ruff lint/format in pre-commit * yaml * Files were cleared but not deleted * finalize Neel's suggestions * Jul 12, 2025 at 13:29 * Jul 12, 2025 at 15:47 * push * Jul 12, 2025 at 16:03 * Jul 12, 2025 at 16:03 * Jul 12, 2025 at 16:03 * fix * fix * fix * push * push * Jul 12, 2025 at 17:48 * remove publish on merge
515 lines
17 KiB
Python
515 lines
17 KiB
Python
import json
|
||
import http.client
|
||
import math
|
||
import os
|
||
import random
|
||
import time
|
||
from multiprocessing import Pool
|
||
|
||
import requests
|
||
import re
|
||
from bs4 import BeautifulSoup
|
||
from pathlib import Path
|
||
|
||
from dotenv import load_dotenv
|
||
from openai import OpenAI
|
||
from typing import Dict, List, Optional, Tuple
|
||
import logging
|
||
import urllib.parse
|
||
from data.plans.prompts import classifier_prompt, extract_steps
|
||
|
||
load_dotenv()
|
||
|
||
|
||
# Set up logging
|
||
logging.basicConfig(level=logging.INFO)
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def split_queries(queries: List[str], num_splits: int) -> List[List[str]]:
|
||
"""Split queries into approximately equal chunks"""
|
||
chunk_size = math.ceil(len(queries) / num_splits)
|
||
return [queries[i : i + chunk_size] for i in range(0, len(queries), chunk_size)]
|
||
|
||
|
||
def crawler_worker(args: Tuple[List[str], str, str, int]):
|
||
"""Worker function for parallel processing"""
|
||
queries, serper_key, openai_key, worker_id = args
|
||
crawler = FactorioCrawler(
|
||
serper_api_key=serper_key, openai_api_key=openai_key, worker_id=worker_id
|
||
)
|
||
crawler.process_all(queries)
|
||
|
||
|
||
class FactorioCrawler:
|
||
def __init__(self, serper_api_key: str, openai_api_key: str, worker_id: int = 0):
|
||
self.serper_api_key = serper_api_key
|
||
self.client = OpenAI(api_key=openai_api_key)
|
||
self.worker_id = worker_id
|
||
self.base_dir = Path("factorio_guides")
|
||
self.base_dir.mkdir(exist_ok=True)
|
||
|
||
# Create visited URLs directory
|
||
self.visited_dir = self.base_dir / "visited"
|
||
self.visited_dir.mkdir(exist_ok=True)
|
||
|
||
# Create a worker-specific lock file for URL visiting
|
||
self.lock_file = self.visited_dir / f"worker_{worker_id}.lock"
|
||
|
||
def url_to_filename(self, url: str) -> str:
|
||
"""Convert URL to a safe filename"""
|
||
# Remove protocol and www
|
||
clean_url = re.sub(r"^https?://(www\.)?", "", url)
|
||
# Convert to safe filename using URL encoding
|
||
return urllib.parse.quote_plus(clean_url)
|
||
|
||
def is_url_visited(self, url: str) -> bool:
|
||
"""Check if URL has been visited before"""
|
||
filename = self.url_to_filename(url)
|
||
return (self.visited_dir / filename[:200]).exists()
|
||
|
||
def mark_url_visited(self, url: str):
|
||
"""Mark URL as visited"""
|
||
filename = self.url_to_filename(url)
|
||
(self.visited_dir / filename[:200]).touch()
|
||
|
||
def clean_content(self, text: str) -> str:
|
||
"""Clean and normalize text content"""
|
||
# Replace multiple newlines with max two newlines
|
||
text = re.sub(r"\n{3,}", "\n\n", text)
|
||
|
||
# Remove excess whitespace
|
||
text = re.sub(r"[ \t]+", " ", text)
|
||
|
||
# Remove spaces at the beginning of lines
|
||
text = re.sub(r"(?m)^[ \t]+", "", text)
|
||
|
||
# Remove spaces at the end of lines
|
||
text = re.sub(r"(?m)[ \t]+$", "", text)
|
||
|
||
# Ensure consistent newline character
|
||
text = text.replace("\r\n", "\n")
|
||
|
||
# Remove any zero-width spaces or other invisible characters
|
||
text = re.sub(r"[\u200B-\u200D\uFEFF]", "", text)
|
||
|
||
# Standardize unicode quotes and dashes
|
||
text = text.replace('"', '"').replace('"', '"')
|
||
text = text.replace(""", "'").replace(""", "'")
|
||
text = text.replace("—", "-").replace("–", "-")
|
||
|
||
return text.strip()
|
||
|
||
def search_guides(self, query: str) -> Dict:
|
||
"""Search for Factorio guides using SERPER API"""
|
||
conn = http.client.HTTPSConnection("google.serper.dev")
|
||
payload = json.dumps({"q": f"factorio {query} guide tutorial tips", "num": 100})
|
||
headers = {"X-API-KEY": self.serper_api_key, "Content-Type": "application/json"}
|
||
|
||
try:
|
||
conn.request("POST", "/search", payload, headers)
|
||
res = conn.getresponse()
|
||
data = json.loads(res.read().decode("utf-8"))
|
||
return data
|
||
except Exception as e:
|
||
logger.error(f"Error searching guides: {e}")
|
||
return {}
|
||
|
||
def fetch_page_content(self, url: str) -> str:
|
||
"""Fetch and parse webpage content"""
|
||
try:
|
||
response = requests.get(url, headers={"User-Agent": "FactorioCrawler/1.0"})
|
||
soup = BeautifulSoup(response.text, "html.parser")
|
||
# Remove script and style elements
|
||
for script in soup(["script", "style"]):
|
||
script.decompose()
|
||
content = soup.get_text()
|
||
return self.clean_content(content)
|
||
except Exception as e:
|
||
logger.error(f"Error fetching page {url}: {e}")
|
||
return ""
|
||
|
||
def save_raw_content(self, content: str, url: str):
|
||
"""Save raw content to filesystem"""
|
||
file_path = self.base_dir / "raw" / f"{hash(url)}.txt"
|
||
file_path.parent.mkdir(exist_ok=True)
|
||
|
||
with open(file_path, "w", encoding="utf-8") as f:
|
||
f.write(f"URL: {url}\n\n")
|
||
f.write(content)
|
||
|
||
def classify_content(self, content: str) -> bool:
|
||
"""Use GPT-4 to classify if content contains instructions"""
|
||
try:
|
||
response = self.client.chat.completions.create(
|
||
model="gpt-4",
|
||
messages=[
|
||
{"role": "system", "content": classifier_prompt},
|
||
{
|
||
"role": "user",
|
||
"content": content[:8000],
|
||
}, # Truncate to avoid token limits
|
||
],
|
||
)
|
||
return response.choices[0].message.content.strip().lower() == "true"
|
||
except Exception as e:
|
||
logger.error(f"Error classifying content: {e}")
|
||
return False
|
||
|
||
def extract_steps(self, content: str) -> Optional[str]:
|
||
"""Extract steps from content using GPT-4"""
|
||
string = ""
|
||
try:
|
||
response = self.client.chat.completions.create(
|
||
model="gpt-4o",
|
||
messages=[
|
||
{"role": "system", "content": extract_steps},
|
||
{"role": "user", "content": content[:4000]},
|
||
],
|
||
)
|
||
|
||
string = response.choices[0].message.content.strip()
|
||
string = string.replace("```xml", "").replace("```", "")
|
||
return string
|
||
except Exception as e:
|
||
logger.error(f"Error extracting steps: {e}\n\n{string}\n===")
|
||
return None
|
||
|
||
def save_guide(self, object: str, url: str):
|
||
"""Save processed XML guide"""
|
||
file_path = self.base_dir / "processed" / f"{hash(url)}.xml"
|
||
file_path.parent.mkdir(exist_ok=True)
|
||
|
||
with open(file_path, "w", encoding="utf-8") as f:
|
||
f.write(object)
|
||
|
||
def process_all(self, search_queries: List[str]):
|
||
"""Main processing function"""
|
||
logger.info(
|
||
f"Worker {self.worker_id} starting with {len(search_queries)} queries"
|
||
)
|
||
|
||
for query in search_queries:
|
||
logger.info(f"Worker {self.worker_id} processing query: {query}")
|
||
search_results = self.search_guides(query)
|
||
|
||
if "organic" not in search_results:
|
||
continue
|
||
|
||
for result in search_results["organic"]:
|
||
url = result.get("link")
|
||
if not url:
|
||
continue
|
||
|
||
if "youtube" in url.lower():
|
||
continue
|
||
|
||
# Use file-based locking for URL visited check
|
||
if self.is_url_visited(url):
|
||
logger.info(
|
||
f"Worker {self.worker_id} skipping previously visited URL: {url}"
|
||
)
|
||
continue
|
||
|
||
# Respect rate limits - different delay for each worker
|
||
time.sleep(1 + (self.worker_id * 0.1)) # Stagger requests
|
||
|
||
# Fetch and save raw content
|
||
content = self.fetch_page_content(url)
|
||
if not content:
|
||
continue
|
||
|
||
self.save_raw_content(content, url)
|
||
|
||
# Classify content
|
||
if not self.classify_content(content):
|
||
self.mark_url_visited(url)
|
||
continue
|
||
|
||
# Extract and save steps
|
||
root = self.extract_steps(content)
|
||
if root is not None:
|
||
self.save_guide(root, url)
|
||
|
||
# Mark URL as visited after processing
|
||
self.mark_url_visited(url)
|
||
|
||
|
||
def main():
|
||
SERPER_KEYS = os.getenv("SERPER_KEYS")
|
||
SERPER_KEYS = SERPER_KEYS.split(",")
|
||
# Configuration
|
||
num_workers_per_key = 2
|
||
total_workers = len(SERPER_KEYS) * num_workers_per_key
|
||
openai_api_key = os.getenv("OPENAI_API_KEY")
|
||
|
||
# Comprehensive early-game search queries
|
||
search_queries = [
|
||
# Absolute Basics
|
||
"beginner guide first steps",
|
||
"getting started tutorial",
|
||
"first hour guide",
|
||
"basic controls tutorial",
|
||
"survival basics guide",
|
||
"early game tips tricks",
|
||
# Initial Resources
|
||
"coal mining guide early",
|
||
"iron mining setup",
|
||
"copper mining basics",
|
||
"stone mining early game",
|
||
"manual resource gathering",
|
||
"starting resource management",
|
||
# First Automation
|
||
"burner inserter setup",
|
||
"first automation steps",
|
||
"basic mining drill layout",
|
||
"steam power setup",
|
||
"coal power generation",
|
||
"basic inserter patterns",
|
||
"early smelting setup",
|
||
# Science Production
|
||
"red science automation",
|
||
"green science setup",
|
||
"early research guide",
|
||
"science pack production",
|
||
"research priorities guide",
|
||
"laboratory setup guide",
|
||
# Basic Infrastructure
|
||
"belt layout guide",
|
||
"early game logistics",
|
||
"transport belt basics",
|
||
"inserter mechanics guide",
|
||
"chest usage tutorial",
|
||
"underground belt guide",
|
||
"splitter tutorial basic",
|
||
# Power Systems
|
||
"basic power setup",
|
||
"steam engine layout",
|
||
"boiler configuration",
|
||
"power management early",
|
||
"electricity network guide",
|
||
"power pole placement",
|
||
# Defense
|
||
"early game defense",
|
||
"basic military setup",
|
||
"turret placement guide",
|
||
"ammunition production",
|
||
"wall construction guide",
|
||
"biter defense basics",
|
||
"early game military research",
|
||
# Manufacturing
|
||
"basic factory layout",
|
||
"early production lines",
|
||
"assembling machine setup",
|
||
"manufacturing priorities",
|
||
"component production guide",
|
||
"intermediate products guide",
|
||
# Resource Processing
|
||
"basic ore processing",
|
||
"early smelting layouts",
|
||
"furnace setup guide",
|
||
"plate production tutorial",
|
||
"smelting column design",
|
||
# Fluids
|
||
"basic fluid handling",
|
||
"pipe systems tutorial",
|
||
"offshore pump setup",
|
||
"early oil processing",
|
||
"fluid storage basics",
|
||
"pump mechanics guide",
|
||
# Organization
|
||
"main bus concept",
|
||
"early base organization",
|
||
"factory planning guide",
|
||
"basic ratios tutorial",
|
||
"production efficiency",
|
||
"base layout principles",
|
||
# Expansion
|
||
"base expansion guide",
|
||
"resource outpost setup",
|
||
"early trains tutorial",
|
||
"expanding production guide",
|
||
"scaling up basics",
|
||
# Specific Production Lines
|
||
"electronic circuit production",
|
||
"iron gear wheel automation",
|
||
"copper cable manufacturing",
|
||
"steel production setup",
|
||
"basic materials flow",
|
||
# Common Problems
|
||
"bottleneck solutions early",
|
||
"production backup fixes",
|
||
"power shortage guide",
|
||
"resource management tips",
|
||
"common mistakes avoid",
|
||
"troubleshooting guide early",
|
||
# Efficiency
|
||
"early game optimization",
|
||
"basic factory ratios",
|
||
"production efficiency guide",
|
||
"resource balancing tips",
|
||
"throughput optimization",
|
||
# Quality of Life
|
||
"early automation tips",
|
||
"manual crafting guide",
|
||
"inventory management",
|
||
"quickbar setup guide",
|
||
"hotkey optimization",
|
||
# Progression Path
|
||
"tech tree progression",
|
||
"research order guide",
|
||
"milestone planning",
|
||
"advancement strategy",
|
||
"progress benchmarks",
|
||
# Starting Automation
|
||
"first automation priority",
|
||
"automating coal mining",
|
||
"basic inserter chains",
|
||
"burner phase automation",
|
||
"crafting queue efficiency",
|
||
"manual to automated transition",
|
||
# Early Production
|
||
"iron plate automation",
|
||
"copper plate production line",
|
||
"basic materials flow",
|
||
"starter base layout",
|
||
"initial factory setup",
|
||
"early game ratios",
|
||
# Resource Collection
|
||
"automated mining setup",
|
||
"coal power sustainability",
|
||
"ore patch efficiency",
|
||
"starting miners layout",
|
||
"resource field optimization",
|
||
"initial mining outpost",
|
||
# Power Management
|
||
"early power scaling",
|
||
"coal supply automation",
|
||
"power consumption planning",
|
||
"backup power systems",
|
||
"power grid layout",
|
||
"electricity management tips",
|
||
# Basic Logistics
|
||
"initial belt systems",
|
||
"early sorting methods",
|
||
"basic item routing",
|
||
"starter bus design",
|
||
"item balancing early",
|
||
"material distribution",
|
||
# Science Production
|
||
"automated red science",
|
||
"science pack scaling",
|
||
"research automation",
|
||
"lab feeding setup",
|
||
"science production ratio",
|
||
"research facility layout",
|
||
# Military Production
|
||
"automated ammo production",
|
||
"turret feeding systems",
|
||
"military supply chain",
|
||
"defensive production",
|
||
"automated wall building",
|
||
"military automation priority",
|
||
# Component Production
|
||
"gear wheel automation",
|
||
"circuit production line",
|
||
"inserter manufacturing",
|
||
"belt production setup",
|
||
"automated components",
|
||
"intermediate products flow",
|
||
# Early Base Design
|
||
"starter factory layout",
|
||
"production block design",
|
||
"early game spacing",
|
||
"factory organization",
|
||
"modular design basics",
|
||
"scalable layouts early",
|
||
# Resource Management
|
||
"coal distribution system",
|
||
"ore processing layout",
|
||
"plate balancing setup",
|
||
"resource priority system",
|
||
"material overflow handling",
|
||
"resource buffer design",
|
||
# Production Lines
|
||
"assembly line basics",
|
||
"production cell design",
|
||
"manufacturing blocks",
|
||
"automated crafting setup",
|
||
"production flow design",
|
||
"assembly machine layout",
|
||
# Belt Systems
|
||
"belt balancing early",
|
||
"underground belt usage",
|
||
"splitter arrangements",
|
||
"belt compression tips",
|
||
"belt priority system",
|
||
"logistics optimization",
|
||
# Power Grid
|
||
"steam engine array",
|
||
"boiler automation",
|
||
"coal power layout",
|
||
"power pole coverage",
|
||
"early grid design",
|
||
"power distribution",
|
||
# Factory Planning
|
||
"initial base planning",
|
||
"expansion preparation",
|
||
"growth bottlenecks",
|
||
"factory scaling tips",
|
||
"base organization",
|
||
"design principles early",
|
||
# Optimization
|
||
"early efficiency tips",
|
||
"production bottlenecks",
|
||
"throughput optimization",
|
||
"resource efficiency",
|
||
"automation priorities",
|
||
"system bottlenecks",
|
||
# Troubleshooting
|
||
"common automation issues",
|
||
"production line fixes",
|
||
"power system problems",
|
||
"belt backup solutions",
|
||
"inserter timing fixes",
|
||
"resource starvation",
|
||
# Quality Improvements
|
||
"factory efficiency",
|
||
"automation upgrades",
|
||
"production speed tips",
|
||
"system improvements",
|
||
"optimization methods",
|
||
"performance enhancement",
|
||
# Progression
|
||
"automation milestones",
|
||
"production goals",
|
||
"development stages",
|
||
"expansion timing",
|
||
"upgrade priorities",
|
||
"advancement planning",
|
||
]
|
||
|
||
# shuffle queries
|
||
search_queries = random.sample(search_queries, len(search_queries))
|
||
|
||
# Split queries for workers
|
||
query_chunks = split_queries(search_queries, total_workers)
|
||
|
||
# Prepare worker arguments
|
||
worker_args = []
|
||
for key_idx, serper_key in enumerate(SERPER_KEYS):
|
||
for worker_num in range(num_workers_per_key):
|
||
worker_id = (key_idx * num_workers_per_key) + worker_num
|
||
if worker_id < len(query_chunks): # Ensure we have queries for this worker
|
||
worker_args.append(
|
||
(query_chunks[worker_id], serper_key, openai_api_key, worker_id)
|
||
)
|
||
|
||
# Start parallel processing
|
||
with Pool(processes=total_workers) as pool:
|
||
pool.map(crawler_worker, worker_args)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|