factorio-learning-environment/data/plans/plan_crawler.py
kiankyars a62f7f208c Restructure (#274)
* add changes

* refactor: clean up unused Docker scripts and redundant files

Remove unused and redundant files from fle/cluster/docker/:

- main.py: Redundant Python script that duplicates run_local.sh functionality
  with hardcoded ports and basic container management. Not used in workflow.

- install-docker.sh: AWS EC2-specific setup script with hardcoded instance
  (ec2-18-133-239-115) and outdated Amazon Linux commands. Current workflow
  uses docker-compose instead.

- probe.sh: Simple port checking script using lsof. Redundant since
  docker-compose handles health checks and container status monitoring.

- setup_docker_repo.sh: AWS ECR-specific setup with hardcoded AWS account
  (216370203482). Contains mostly commented code and unused ECR repository
  configuration. Not used in current workflow.

- requirements.txt: Redundant Python dependency file. Docker is a system
  dependency, not a Python package. The Python docker SDK is already
  included in pyproject.toml dependencies.

Kept essential files: Dockerfile, build scripts, run scripts, config/,
mods/, and README.md which are actively used in the Docker workflow.

* refactor: move lib/ and tools/ to mods/, clean up fle/env/utils/ (all deleted files were unused), prep for entrypoints/ refactor

* readd

* last undo

* refactor: move evaluator.py to algorithms/mcts, move experiment entrypoints to entrypoints/, update imports accordingly

* version info

* incorrect gitignore

* style: replace all relative imports in fle.agents with absolute imports

* simplify gitignore

* commit

* gitignore

* redo

* update

* No code is importing the fle.agents.data package/module

* exclude data/prompts from ruff lint/format in pre-commit

* yaml

* Files were cleared but not deleted

* finalize Neel's suggestions

* Jul 12, 2025 at 13:29

* Jul 12, 2025 at 15:47

* push

* Jul 12, 2025 at 16:03

* Jul 12, 2025 at 16:03

* Jul 12, 2025 at 16:03

* fix

* fix

* fix

* push

* push

* Jul 12, 2025 at 17:48

* remove publish on merge
2025-07-17 17:12:47 +03:00

515 lines · 17 KiB · Python

import hashlib
import http.client
import json
import logging
import math
import os
import random
import re
import time
import urllib.parse
from multiprocessing import Pool
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI

from data.plans.prompts import classifier_prompt, extract_steps

load_dotenv()

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def split_queries(queries: List[str], num_splits: int) -> List[List[str]]:
    """Split queries into approximately equal chunks"""
    chunk_size = math.ceil(len(queries) / num_splits)
    return [queries[i : i + chunk_size] for i in range(0, len(queries), chunk_size)]
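

# A minimal worked example of the chunking above (hypothetical input):
#   split_queries(["a", "b", "c", "d", "e"], 2) -> [["a", "b", "c"], ["d", "e"]]
# since chunk_size = ceil(5 / 2) = 3.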


def crawler_worker(args: Tuple[List[str], str, str, int]):
    """Worker function for parallel processing"""
    queries, serper_key, openai_key, worker_id = args
    crawler = FactorioCrawler(
        serper_api_key=serper_key, openai_api_key=openai_key, worker_id=worker_id
    )
    crawler.process_all(queries)


class FactorioCrawler:
    def __init__(self, serper_api_key: str, openai_api_key: str, worker_id: int = 0):
        self.serper_api_key = serper_api_key
        self.client = OpenAI(api_key=openai_api_key)
        self.worker_id = worker_id
        self.base_dir = Path("factorio_guides")
        self.base_dir.mkdir(exist_ok=True)
        # Create visited URLs directory
        self.visited_dir = self.base_dir / "visited"
        self.visited_dir.mkdir(exist_ok=True)
        # Worker-specific lock file path, reserved for file-based locking
        # (defined here but not used elsewhere in this module)
        self.lock_file = self.visited_dir / f"worker_{worker_id}.lock"

    def url_to_filename(self, url: str) -> str:
        """Convert URL to a safe filename"""
        # Remove protocol and www
        clean_url = re.sub(r"^https?://(www\.)?", "", url)
        # Convert to safe filename using URL encoding
        return urllib.parse.quote_plus(clean_url)
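
    # For illustration (hypothetical URL), the mapping above behaves like:
    #   "https://www.example.com/guide?p=1" -> "example.com%2Fguide%3Fp%3D1"
    # quote_plus percent-encodes "/", "?" and "=", giving a flat, filesystem-safe name.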

    def is_url_visited(self, url: str) -> bool:
        """Check if URL has been visited before"""
        filename = self.url_to_filename(url)
        # Truncate to 200 chars to stay within filesystem filename limits
        return (self.visited_dir / filename[:200]).exists()

    def mark_url_visited(self, url: str):
        """Mark URL as visited by touching a marker file"""
        filename = self.url_to_filename(url)
        (self.visited_dir / filename[:200]).touch()
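
    # Cross-worker dedup is check-then-touch on these marker files. The two
    # steps are not atomic, so two workers can occasionally race on the same
    # URL; the worst case is duplicated fetching, not corrupted output.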

    def clean_content(self, text: str) -> str:
        """Clean and normalize text content"""
        # Replace multiple newlines with max two newlines
        text = re.sub(r"\n{3,}", "\n\n", text)
        # Remove excess whitespace
        text = re.sub(r"[ \t]+", " ", text)
        # Remove spaces at the beginning of lines
        text = re.sub(r"(?m)^[ \t]+", "", text)
        # Remove spaces at the end of lines
        text = re.sub(r"(?m)[ \t]+$", "", text)
        # Ensure consistent newline character
        text = text.replace("\r\n", "\n")
        # Remove any zero-width spaces or other invisible characters
        text = re.sub(r"[\u200B-\u200D\uFEFF]", "", text)
        # Standardize unicode quotes and dashes
        # (escapes: curly double/single quotes, en dash, em dash)
        text = text.replace("\u201c", '"').replace("\u201d", '"')
        text = text.replace("\u2018", "'").replace("\u2019", "'")
        text = text.replace("\u2013", "-").replace("\u2014", "-")
        return text.strip()

    def search_guides(self, query: str) -> Dict:
        """Search for Factorio guides using the Serper API"""
        conn = http.client.HTTPSConnection("google.serper.dev")
        payload = json.dumps({"q": f"factorio {query} guide tutorial tips", "num": 100})
        headers = {"X-API-KEY": self.serper_api_key, "Content-Type": "application/json"}
        try:
            conn.request("POST", "/search", payload, headers)
            res = conn.getresponse()
            data = json.loads(res.read().decode("utf-8"))
            return data
        except Exception as e:
            logger.error(f"Error searching guides: {e}")
            return {}
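
    # process_all() below only relies on the "organic" key of the Serper
    # response: a list of results whose "link" field is the page URL, roughly
    #   {"organic": [{"title": "...", "link": "https://...", ...}, ...]}
    # (shape shown for illustration; see https://serper.dev for the schema).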

    def fetch_page_content(self, url: str) -> str:
        """Fetch and parse webpage content"""
        try:
            response = requests.get(
                url,
                headers={"User-Agent": "FactorioCrawler/1.0"},
                timeout=30,  # avoid hanging a worker on an unresponsive host
            )
            soup = BeautifulSoup(response.text, "html.parser")
            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()
            content = soup.get_text()
            return self.clean_content(content)
        except Exception as e:
            logger.error(f"Error fetching page {url}: {e}")
            return ""

    def save_raw_content(self, content: str, url: str):
        """Save raw content to filesystem"""
        # Use a stable digest rather than the built-in hash(), which is
        # randomized per process and would give the same URL different
        # filenames across workers and runs
        digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
        file_path = self.base_dir / "raw" / f"{digest}.txt"
        file_path.parent.mkdir(exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(f"URL: {url}\n\n")
            f.write(content)

    def classify_content(self, content: str) -> bool:
        """Use GPT-4 to classify if content contains instructions"""
        try:
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": classifier_prompt},
                    {
                        "role": "user",
                        "content": content[:8000],  # Truncate to avoid token limits
                    },
                ],
            )
            return response.choices[0].message.content.strip().lower() == "true"
        except Exception as e:
            logger.error(f"Error classifying content: {e}")
            return False
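
    # Note: the equality check above assumes classifier_prompt instructs the
    # model to answer with a bare "true" or "false"; any other reply is
    # treated as "false" and the page is skipped.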

    def extract_steps(self, content: str) -> Optional[str]:
        """Extract steps from content using GPT-4o"""
        string = ""
        try:
            response = self.client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": extract_steps},
                    {"role": "user", "content": content[:4000]},
                ],
            )
            string = response.choices[0].message.content.strip()
            # Strip any markdown code fences the model may wrap the XML in
            string = string.replace("```xml", "").replace("```", "")
            return string
        except Exception as e:
            logger.error(f"Error extracting steps: {e}\n\n{string}\n===")
            return None

    def save_guide(self, xml_content: str, url: str):
        """Save processed XML guide"""
        # Same stable digest scheme as save_raw_content, so the raw .txt and
        # processed .xml for a URL share a basename
        digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
        file_path = self.base_dir / "processed" / f"{digest}.xml"
        file_path.parent.mkdir(exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(xml_content)

    def process_all(self, search_queries: List[str]):
        """Main processing function"""
        logger.info(
            f"Worker {self.worker_id} starting with {len(search_queries)} queries"
        )
        for query in search_queries:
            logger.info(f"Worker {self.worker_id} processing query: {query}")
            search_results = self.search_guides(query)
            if "organic" not in search_results:
                continue
            for result in search_results["organic"]:
                url = result.get("link")
                if not url:
                    continue
                if "youtube" in url.lower():
                    continue
                # Skip URLs already marked as visited on disk
                if self.is_url_visited(url):
                    logger.info(
                        f"Worker {self.worker_id} skipping previously visited URL: {url}"
                    )
                    continue
                # Respect rate limits - stagger requests with a per-worker delay
                time.sleep(1 + (self.worker_id * 0.1))
                # Fetch and save raw content
                content = self.fetch_page_content(url)
                if not content:
                    continue
                # Classify content
                self.save_raw_content(content, url)
                if not self.classify_content(content):
                    self.mark_url_visited(url)
                    continue
                # Extract and save steps
                root = self.extract_steps(content)
                if root is not None:
                    self.save_guide(root, url)
                # Mark URL as visited after processing
                self.mark_url_visited(url)
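

# A minimal single-process usage sketch (hypothetical keys; main() below
# normally fans this out across a Pool of crawler_worker processes):
#   crawler = FactorioCrawler(serper_api_key="...", openai_api_key="...", worker_id=0)
#   crawler.process_all(["early smelting setup", "red science automation"])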


def main():
    # Fail fast if keys are missing rather than crashing on .split()
    SERPER_KEYS = os.getenv("SERPER_KEYS")
    if not SERPER_KEYS:
        raise RuntimeError("SERPER_KEYS environment variable is not set")
    SERPER_KEYS = SERPER_KEYS.split(",")
    # Configuration
    num_workers_per_key = 2
    total_workers = len(SERPER_KEYS) * num_workers_per_key
    openai_api_key = os.getenv("OPENAI_API_KEY")
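
    # Expected environment (e.g. in a .env file picked up by load_dotenv();
    # variable names are what this script reads, values are placeholders):
    #   SERPER_KEYS=key1,key2   # comma-separated Serper keys, 2 workers per key
    #   OPENAI_API_KEY=sk-...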
    # Comprehensive early-game search queries
    search_queries = [
        # Absolute Basics
        "beginner guide first steps",
        "getting started tutorial",
        "first hour guide",
        "basic controls tutorial",
        "survival basics guide",
        "early game tips tricks",
        # Initial Resources
        "coal mining guide early",
        "iron mining setup",
        "copper mining basics",
        "stone mining early game",
        "manual resource gathering",
        "starting resource management",
        # First Automation
        "burner inserter setup",
        "first automation steps",
        "basic mining drill layout",
        "steam power setup",
        "coal power generation",
        "basic inserter patterns",
        "early smelting setup",
        # Science Production
        "red science automation",
        "green science setup",
        "early research guide",
        "science pack production",
        "research priorities guide",
        "laboratory setup guide",
        # Basic Infrastructure
        "belt layout guide",
        "early game logistics",
        "transport belt basics",
        "inserter mechanics guide",
        "chest usage tutorial",
        "underground belt guide",
        "splitter tutorial basic",
        # Power Systems
        "basic power setup",
        "steam engine layout",
        "boiler configuration",
        "power management early",
        "electricity network guide",
        "power pole placement",
        # Defense
        "early game defense",
        "basic military setup",
        "turret placement guide",
        "ammunition production",
        "wall construction guide",
        "biter defense basics",
        "early game military research",
        # Manufacturing
        "basic factory layout",
        "early production lines",
        "assembling machine setup",
        "manufacturing priorities",
        "component production guide",
        "intermediate products guide",
        # Resource Processing
        "basic ore processing",
        "early smelting layouts",
        "furnace setup guide",
        "plate production tutorial",
        "smelting column design",
        # Fluids
        "basic fluid handling",
        "pipe systems tutorial",
        "offshore pump setup",
        "early oil processing",
        "fluid storage basics",
        "pump mechanics guide",
        # Organization
        "main bus concept",
        "early base organization",
        "factory planning guide",
        "basic ratios tutorial",
        "production efficiency",
        "base layout principles",
        # Expansion
        "base expansion guide",
        "resource outpost setup",
        "early trains tutorial",
        "expanding production guide",
        "scaling up basics",
        # Specific Production Lines
        "electronic circuit production",
        "iron gear wheel automation",
        "copper cable manufacturing",
        "steel production setup",
        "basic materials flow",
        # Common Problems
        "bottleneck solutions early",
        "production backup fixes",
        "power shortage guide",
        "resource management tips",
        "common mistakes avoid",
        "troubleshooting guide early",
        # Efficiency
        "early game optimization",
        "basic factory ratios",
        "production efficiency guide",
        "resource balancing tips",
        "throughput optimization",
        # Quality of Life
        "early automation tips",
        "manual crafting guide",
        "inventory management",
        "quickbar setup guide",
        "hotkey optimization",
        # Progression Path
        "tech tree progression",
        "research order guide",
        "milestone planning",
        "advancement strategy",
        "progress benchmarks",
        # Starting Automation
        "first automation priority",
        "automating coal mining",
        "basic inserter chains",
        "burner phase automation",
        "crafting queue efficiency",
        "manual to automated transition",
        # Early Production
        "iron plate automation",
        "copper plate production line",
        "basic materials flow",
        "starter base layout",
        "initial factory setup",
        "early game ratios",
        # Resource Collection
        "automated mining setup",
        "coal power sustainability",
        "ore patch efficiency",
        "starting miners layout",
        "resource field optimization",
        "initial mining outpost",
        # Power Management
        "early power scaling",
        "coal supply automation",
        "power consumption planning",
        "backup power systems",
        "power grid layout",
        "electricity management tips",
        # Basic Logistics
        "initial belt systems",
        "early sorting methods",
        "basic item routing",
        "starter bus design",
        "item balancing early",
        "material distribution",
        # Science Production
        "automated red science",
        "science pack scaling",
        "research automation",
        "lab feeding setup",
        "science production ratio",
        "research facility layout",
        # Military Production
        "automated ammo production",
        "turret feeding systems",
        "military supply chain",
        "defensive production",
        "automated wall building",
        "military automation priority",
        # Component Production
        "gear wheel automation",
        "circuit production line",
        "inserter manufacturing",
        "belt production setup",
        "automated components",
        "intermediate products flow",
        # Early Base Design
        "starter factory layout",
        "production block design",
        "early game spacing",
        "factory organization",
        "modular design basics",
        "scalable layouts early",
        # Resource Management
        "coal distribution system",
        "ore processing layout",
        "plate balancing setup",
        "resource priority system",
        "material overflow handling",
        "resource buffer design",
        # Production Lines
        "assembly line basics",
        "production cell design",
        "manufacturing blocks",
        "automated crafting setup",
        "production flow design",
        "assembly machine layout",
        # Belt Systems
        "belt balancing early",
        "underground belt usage",
        "splitter arrangements",
        "belt compression tips",
        "belt priority system",
        "logistics optimization",
        # Power Grid
        "steam engine array",
        "boiler automation",
        "coal power layout",
        "power pole coverage",
        "early grid design",
        "power distribution",
        # Factory Planning
        "initial base planning",
        "expansion preparation",
        "growth bottlenecks",
        "factory scaling tips",
        "base organization",
        "design principles early",
        # Optimization
        "early efficiency tips",
        "production bottlenecks",
        "throughput optimization",
        "resource efficiency",
        "automation priorities",
        "system bottlenecks",
        # Troubleshooting
        "common automation issues",
        "production line fixes",
        "power system problems",
        "belt backup solutions",
        "inserter timing fixes",
        "resource starvation",
        # Quality Improvements
        "factory efficiency",
        "automation upgrades",
        "production speed tips",
        "system improvements",
        "optimization methods",
        "performance enhancement",
        # Progression
        "automation milestones",
        "production goals",
        "development stages",
        "expansion timing",
        "upgrade priorities",
        "advancement planning",
    ]
    # Shuffle queries
    search_queries = random.sample(search_queries, len(search_queries))
    # Split queries for workers
    query_chunks = split_queries(search_queries, total_workers)
    # Prepare worker arguments
    worker_args = []
    for key_idx, serper_key in enumerate(SERPER_KEYS):
        for worker_num in range(num_workers_per_key):
            worker_id = (key_idx * num_workers_per_key) + worker_num
            if worker_id < len(query_chunks):  # Ensure we have queries for this worker
                worker_args.append(
                    (query_chunks[worker_id], serper_key, openai_api_key, worker_id)
                )
    # Start parallel processing
    with Pool(processes=total_workers) as pool:
        pool.map(crawler_worker, worker_args)


if __name__ == "__main__":
    main()
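
# Presumably run from the repository root so that the absolute import
# `from data.plans.prompts import ...` resolves, e.g.:
#   python -m data.plans.plan_crawler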