Mirror of https://github.com/JackHopkins/factorio-learning-environment.git
(synced 2025-09-06 13:23:58 +00:00)

* add changes * refactor: clean up unused Docker scripts and redundant files Remove unused and redundant files from fle/cluster/docker/: - main.py: Redundant Python script that duplicates run_local.sh functionality with hardcoded ports and basic container management. Not used in workflow. - install-docker.sh: AWS EC2-specific setup script with hardcoded instance (ec2-18-133-239-115) and outdated Amazon Linux commands. Current workflow uses docker-compose instead. - probe.sh: Simple port checking script using lsof. Redundant since docker-compose handles health checks and container status monitoring. - setup_docker_repo.sh: AWS ECR-specific setup with hardcoded AWS account (216370203482). Contains mostly commented code and unused ECR repository configuration. Not used in current workflow. - requirements.txt: Redundant Python dependency file. Docker is a system dependency, not a Python package. The Python docker SDK is already included in pyproject.toml dependencies. Kept essential files: Dockerfile, build scripts, run scripts, config/, mods/, and README.md which are actively used in the Docker workflow. * refactor: move lib/ and tools/ to mods/, clean up fle/env/utils/ (all deleted files were unused), prep for entrypoints/ refactor * readd * last undo * refactor: move evaluator.py to algorithms/mcts, move experiment entrypoints to entrypoints/, update imports accordingly * version info * incorrect gitignore * style: replace all relative imports in fle.agents with absolute imports * simplify gitignore * commit * gitignore * redo * update * No code is importing the fle.agents.data package/module * exclude data/prompts from ruff lint/format in pre-commit * yaml * Files were cleared but not deleted * finalize Neel's suggestions * Jul 12, 2025 at 13:29 * Jul 12, 2025 at 15:47 * push * Jul 12, 2025 at 16:03 * Jul 12, 2025 at 16:03 * Jul 12, 2025 at 16:03 * fix * fix * fix * push * push * Jul 12, 2025 at 17:48 * remove publish on merge
151 lines
4.8 KiB
Python
151 lines
4.8 KiB
Python
import json
|
|
import re
|
|
from pathlib import Path
|
|
from typing import List, Tuple
|
|
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass
class Node:
    """One node of the parsed objective hierarchy.

    Fields are documented with comments because dataclass fields cannot
    carry docstrings.
    """

    # Dot-separated outline label such as "1", "1.2", or "1.2.3"
    # ("0" for the synthetic root).
    number: str
    # The objective text of the line, without its numeric prefix.
    content: str
    # Direct sub-objectives, in document order.
    children: List["Node"]
|
|
|
|
|
|
def parse_hierarchy(text: str) -> Node:
    """Parse numbered outline text into a tree of :class:`Node` objects.

    Each meaningful line must look like ``"<number>. <content>"`` where
    ``<number>`` is a dot-separated path such as ``"2"`` or ``"1.3.2"``
    (note the trailing period before the content); any other line is
    skipped.  Returns a synthetic root node (number ``"0"``, content
    ``"root"``) whose children are the top-level items.

    BUGFIX: the previous version overwrote the top of the ancestor stack
    with each new node, so a sibling at the same depth was attached under
    the *previous* node instead of under their shared parent (e.g. "2."
    ended up as a child of "1.").  The stack now keeps root at index 0 and
    the most recent node of depth d at index d.
    """
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    root = Node("0", "root", [])
    # current_path[d] is the most recently seen node at depth d; the
    # synthetic root sits at depth 0.
    current_path = [root]

    for line in lines:
        # Extract the dot-separated number and the content after it.
        match = re.match(r"^(\d+(\.\d+)*)\.\s+(.+)$", line)
        if not match:
            continue

        number, _, content = match.groups()
        depth = len(number.split("."))
        new_node = Node(number, content, [])

        # Trim the stack so its top is this node's parent (depth - 1).
        while len(current_path) > depth:
            current_path.pop()
        # If the outline skips levels (e.g. "1." straight to "1.1.1."),
        # reuse the nearest ancestor as a stand-in parent for the gap.
        while len(current_path) < depth:
            current_path.append(current_path[-1])

        current_path[-1].children.append(new_node)
        current_path.append(new_node)

    return root
|
|
|
|
|
|
def generate_training_examples(root: "Node") -> List[Tuple[str, str]]:
    """Generate (input, output) finetuning pairs from an objective tree.

    For every node that has children, three kinds of examples are emitted:
      1. node history with an inline ``<>`` marker -> all of its children;
      2. history ending at child i plus a ``" <>"`` line -> child i + 1
         (next-sibling prediction);
      3. history ending at the last child plus a ``" <>"`` line -> the
         literal ``"</>"`` end-of-branch token.

    The synthetic ``"0. root"`` label is scrubbed from every input.  Note
    the deliberate asymmetry: case (1) appends ``<>`` directly to the last
    line (no newline), which keeps "list the children" prompts distinct
    from the sibling/end-of-branch prompts of cases (2) and (3).
    """
    examples: List[Tuple[str, str]] = []

    def clean(lines: List[str]) -> str:
        # Join history lines and drop the synthetic root label.
        return "\n".join(lines).replace("0. root", "").strip()

    def process_node(node: "Node", history: List[str]) -> None:
        if not node.children:
            return

        path = history + [f"{node.number}. {node.content}"]

        # (1) Predict the full child list.  The previous version built the
        # marker via a no-op '"\n <>".replace("0. root", "").strip()'
        # chain on the literal itself, which evaluates to "<>"; that
        # resulting inline marker is kept, minus the dead code.
        children_output = "\n".join(
            f"{child.number}. {child.content}" for child in node.children
        )
        examples.append((clean(path) + "<>", children_output))

        # (2) For each child except the last, predict its next sibling
        # from the (indented) children seen so far.
        for i in range(len(node.children) - 1):
            seen = [
                f" {node.children[j].number}. {node.children[j].content}"
                for j in range(i + 1)
            ]
            nxt = node.children[i + 1]
            examples.append(
                (clean(path + seen + [" <>"]), f"{nxt.number}. {nxt.content}")
            )

        # (3) After the last child, predict the end-of-branch token.
        last = node.children[-1]
        leaf_input = clean(path + [f" {last.number}. {last.content}", " <>"])
        examples.append((leaf_input, "</>"))

        # Recurse into each child with the extended history.
        for child in node.children:
            process_node(child, path)

    process_node(root, [])
    return examples
|
|
|
|
|
|
def process_file(content: str) -> List[Tuple[str, str]]:
    """Parse *content* as a numbered outline and emit finetuning pairs."""
    return generate_training_examples(parse_hierarchy(content))
|
|
|
|
|
|
def main():
    """Convert processed guide outlines into an OpenAI-finetuning JSONL file.

    Reads every ``*.xml`` file under ``./factorio_guides/processed``,
    strips preamble and markdown bold markers, generates training
    examples, and appends them in chat format to
    ``factorio_relationships.jsonl``.
    """
    output_file = "factorio_relationships.jsonl"
    processed_dir = Path("./factorio_guides/processed")

    # Append mode: repeated runs accumulate examples rather than overwrite.
    with open(output_file, "a", encoding="utf-8") as fo:
        for file in processed_dir.glob("*.xml"):
            content = file.read_text(encoding="utf-8")

            # Drop any preamble before the first top-level "1. " item.
            # Best-effort: files without that marker are kept unchanged
            # (the old bare `except:` is narrowed to the one error the
            # split/index can raise).
            try:
                content = "1. " + content.split("\n1. ")[1]
            except IndexError:
                pass

            # Strip markdown bold markers so the outline regex can match.
            content = content.replace("**", "")

            examples = process_file(content)

            # Save in the chat format expected by OpenAI finetuning.
            for input_text, output_text in examples:
                record = {
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are a helpful assistant that decides on the most appropriate Factorio game objective",
                        },
                        {"role": "user", "content": input_text},
                        {"role": "assistant", "content": output_text},
                    ]
                }
                fo.write(json.dumps(record) + "\n")
                print({"prompt": input_text, "completion": output_text})


if __name__ == "__main__":
    main()
|