Tutorial AAMAS 2026
Welcome to the hands-on session! Here’s how this notebook works:
Predict. Before each result we’ll commit a guess via a small poll.
Run. Execute the cell and see whether the algorithm agrees with you.
Play. Sliders let you change the number of traces to learn from, and watch the metrics react.
Compete. We end with a leaderboard challenge — best solver wins.
Tip: in Colab/Jupyter, if you want to run all cells at once, click Cell → Run All only after you’ve made your predictions in each section.
[ ]:
# Install the notebook's dependencies. stdout and stderr are both redirected
# to /dev/null, so installation errors are silent as well.
%pip install amlgym ipywidgets matplotlib pandas > /dev/null 2>&1
[ ]:
# Shared utilities used throughout the notebook
import re
import difflib
import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, HTML, Markdown
# ggplot style for every plot
plt.style.use("ggplot")
# ---------- polls & reveals ----------
# HTML template for the poll banner. The ``{header}`` and ``{question}``
# placeholders are filled in with ``str.format`` by ``make_poll``.
_POLL_HTML = """<div style='border-left:4px solid #6f42c1;background:#f3eefb;
padding:12px 16px;margin:10px 0;border-radius:6px;
font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif;'>
<div style='font-size:15px;font-weight:600;color:#6f42c1;margin-bottom:4px;
letter-spacing:0.3px;'>{header}</div>
<div style='color:#24292f;font-size:14px;line-height:1.45;'>{question}</div></div>"""
def make_poll(question, options, header="\U0001F52E Cast your prediction"):
    """Show a styled question banner followed by a radio-button group.

    Args:
        question: Text placed in the body of the banner.
        options: Choices offered by the radio buttons.
        header: Title line of the banner.

    Returns:
        The ``RadioButtons`` widget, so its ``.value`` can be read later.
    """
    banner = _POLL_HTML.format(header=header, question=question)
    display(HTML(banner))
    radio = widgets.RadioButtons(
        options=options,
        layout=widgets.Layout(width="auto"),
    )
    display(radio)
    return radio
def reveal(poll, truth, formatter=str):
    """Display a banner saying whether the selected poll option was right.

    Args:
        poll: Object whose ``.value`` holds the audience's choice.
        truth: The correct option; compared to ``poll.value`` with ``==``.
        formatter: Callable turning ``truth`` into the text shown on the
            "Truth" line. With the default (``str``) the truth is shown as is.
    """
    guess = poll.value
    if guess == truth:
        # Green palette for a correct guess.
        accent, bg, border = "#15803d", "#dcfce7", "#16a34a"
        icon, msg = "\U0001F31F", "Spot on!"
    else:
        # Amber palette for a miss.
        accent, bg, border = "#92400e", "#fef3c7", "#d97706"
        icon, msg = "\U0001F9ED", "Off the mark - but now you know."

    shown_truth = truth if formatter is str else formatter(truth)

    banner = (
        f"<div style='border-left:4px solid {border};background:{bg};"
        "padding:12px 16px;margin:10px 0;border-radius:6px;"
        "font-family:-apple-system,BlinkMacSystemFont,\"Segoe UI\",sans-serif;font-size:14px;'>"
        f"<div style='font-size:15px;font-weight:700;color:{accent};margin-bottom:6px;'>"
        f"{icon} {msg}</div>"
        f"<div style='color:#374151;'><b>Your guess:</b> {guess!s}</div>"
        f"<div style='color:#374151;'><b>Truth:</b> {shown_truth!s}</div></div>"
    )
    display(HTML(banner))
def pick_bucket(value, options):
    """Return the poll option whose numeric range contains ``value``.

    Options are examined in list order and the first match wins. Recognised
    shapes (keywords are matched case-insensitively):

    * ``"Exactly 1..."``  -> matches when ``value`` is at least 1 (within 1e-9)
    * ``"Below X"``       -> matches when ``value < X``
    * ``"LO ... HI"``     -> matches when ``LO <= value < HI`` (first two numbers)
    * ``"X ... perfect"`` -> matches when ``value >= X`` (within 1e-9)

    If nothing matches, ``value`` sits in a gap between buckets (for example
    0.995 with buckets "0.8 – 0.99" and "Exactly 1.0"). In that case the range
    bucket with the highest upper bound not exceeding ``value`` is returned,
    so a nearly-perfect score is not reported as exactly 1.0. Only when no
    such range exists is the last option returned.

    Args:
        value: The number to classify.
        options: Non-empty sequence of option strings.

    Returns:
        One of the strings in ``options``.
    """
    number = re.compile(r"\d+(?:\.\d+)?")
    eps = 1e-9
    ranges = []  # (upper bound, option) for every "LO ... HI" option seen

    for opt in options:
        nums = [float(n) for n in number.findall(opt)]
        if re.search(r"\bexactly\s+1", opt, re.I) and value >= 1.0 - eps:
            return opt
        if re.search(r"\bbelow\b", opt, re.I) and nums and value < nums[0]:
            return opt
        if len(nums) >= 2:
            lo, hi = nums[0], nums[1]
            if lo <= value < hi:
                return opt
            ranges.append((hi, opt))
        elif len(nums) == 1 and "perfect" in opt.lower() and value >= nums[0] - eps:
            return opt

    # Gap handling: prefer the closest range lying entirely at or below value.
    candidates = [pair for pair in ranges if pair[0] <= value]
    if candidates:
        return max(candidates, key=lambda pair: pair[0])[1]
    return options[-1]
def _tokenize(text):
text = re.sub(r";[^\n]*", "", text)
return re.findall(r"\(|\)|[^\s()]+", text)
def _parse_all(tokens):
def helper(i):
if tokens[i] == "(":
out, i = [], i + 1
while i < len(tokens) and tokens[i] != ")":
child, i = helper(i)
out.append(child)
return out, i + 1
return tokens[i], i + 1
forms, i = [], 0
while i < len(tokens):
f, i = helper(i)
forms.append(f)
return forms
def _flat(node):
if isinstance(node, str):
return node
return "(" + " ".join(_flat(c) for c in node) + ")"
def _canon_var_list(items):
"""Rename ?vars left-to-right to ?p1, ?p2, ... ; return (new_items, map)."""
out, m, idx, j = [], {}, 0, 0
while j < len(items):
tok = items[j]
if isinstance(tok, str) and tok.startswith("?"):
idx += 1
new = "?p{}".format(idx)
m[tok] = new
out.append(new)
j += 1
if j + 1 < len(items) and items[j] == "-":
out.extend(["-", items[j + 1]])
j += 2
else:
out.append(tok)
j += 1
return out, m
def _canon_predicate_decl(pred):
if not (isinstance(pred, list) and pred):
return pred
new_args, _ = _canon_var_list(pred[1:])
return [pred[0]] + new_args
def _sort_and(node):
if isinstance(node, list) and node and isinstance(node[0], str) and node[0] == "and":
kids = [_sort_and(c) for c in node[1:]]
kids.sort(key=_flat)
return ["and"] + kids
if isinstance(node, list):
return [_sort_and(c) if isinstance(c, list) else c for c in node]
return node
def _canon_action(action):
if not (isinstance(action, list) and action and action[0] == ":action"):
return action
name = action[1]
kv, i = {}, 2
while i < len(action) - 1:
kv[action[i]] = action[i + 1]
i += 2
canon_params, params_map = [], {}
if ":parameters" in kv and isinstance(kv[":parameters"], list):
canon_params, params_map = _canon_var_list(kv[":parameters"])
def rename(n):
if isinstance(n, str):
return params_map.get(n, n)
return [rename(c) for c in n]
out = [":action", name]
if ":parameters" in kv:
out += [":parameters", canon_params]
if ":precondition" in kv:
out += [":precondition", _sort_and(rename(kv[":precondition"]))]
if ":effect" in kv:
out += [":effect", _sort_and(rename(kv[":effect"]))]
return out
def _canon_predicates_block(block):
if not (isinstance(block, list) and block and block[0] == ":predicates"):
return block
preds = [_canon_predicate_decl(p) for p in block[1:]]
preds.sort(key=_flat)
return [":predicates"] + preds
def _render(node, indent=0):
if isinstance(node, str):
return node
if not node:
return "()"
head = node[0]
if isinstance(head, str) and head == "and" and len(node) > 1:
sub_ind = " " * (indent + 1)
body = "\n".join(sub_ind + _render(c, indent + 1) for c in node[1:])
return "(and\n" + body + ")"
if isinstance(head, str) and head == ":action":
ind = " " * (indent + 1)
out = "(:action " + str(node[1])
i = 2
while i < len(node) - 1:
kw, val = node[i], node[i + 1]
val_str = _render(val, indent + 2)
if "\n" in val_str:
out += "\n" + ind + kw + "\n" + ind + " " + val_str.replace("\n", "\n ")
else:
out += "\n" + ind + kw + " " + val_str
i += 2
return out + ")"
if isinstance(head, str) and head == ":predicates":
ind = " " * (indent + 1)
body = "\n".join(ind + _render(p, indent + 1) for p in node[1:])
return "(:predicates\n" + body + ")"
parts = [_render(c, indent + 1) if isinstance(c, list) else c for c in node]
return "(" + " ".join(parts) + ")"
def pddl_canonical(text):
    """Return a canonical, diff-friendly PDDL string.

    The text is lower-cased, parsed into s-expressions and re-rendered.
    Inside each ``(define ...)`` form the ``:predicates`` block and every
    ``:action`` are canonicalised; actions are sorted by name and written
    after all other elements, each preceded by a blank line. Top-level
    atoms (anything that is not a list) are dropped.

    This is a best-effort helper for display purposes: if anything goes
    wrong while parsing, canonicalising or rendering (for example a
    malformed ``(:action)`` with no name), the input ``text`` is returned
    unchanged instead of raising.
    """
    try:
        forms = _parse_all(_tokenize(text.lower()))
        lines = []
        for form in forms:
            if not isinstance(form, list):
                continue
            if not (form and form[0] == "define"):
                lines.append(_render(form))
                continue

            lines.append("(define")
            non_actions, actions = [], []
            for elem in form[1:]:
                is_tagged = isinstance(elem, list) and elem
                if is_tagged and elem[0] == ":action":
                    actions.append(_canon_action(elem))
                elif is_tagged and elem[0] == ":predicates":
                    non_actions.append(_canon_predicates_block(elem))
                else:
                    non_actions.append(elem)
            actions.sort(key=lambda a: a[1] if len(a) > 1 else "")

            for elem in non_actions:
                rendered = _render(elem, 1)
                lines.append(" " + rendered.replace("\n", "\n "))
            for elem in actions:
                rendered = _render(elem, 1)
                lines.append("")
                lines.append(" " + rendered.replace("\n", "\n "))
            lines.append(")")
        return "\n".join(lines) + "\n"
    except Exception:
        # Deliberately broad: showing the raw text is preferable to failing
        # the notebook cell on a malformed (e.g. badly learned) domain file.
        return text
# Stylesheet prepended to the table produced by ``html_diff``. It restyles
# the ``table.diff`` markup and its diff_add / diff_chg / diff_sub classes,
# and hides the ``diff_next`` navigation column.
_DIFF_CSS = """<style>
table.diff{font-family:"SF Mono",Menlo,Consolas,monospace;font-size:12px;
border-collapse:collapse;width:100%;border:1px solid #d0d7de;border-radius:6px;
overflow:hidden;margin:8px 0;background:#ffffff;table-layout:fixed;}
table.diff td{padding:1px 8px;vertical-align:top;white-space:pre-wrap;
border:none;line-height:1.45;word-break:break-word;}
table.diff th{background:#f6f8fa;color:#24292f;font-weight:600;
padding:8px 10px;border-bottom:1px solid #d0d7de;text-align:left;}
table.diff td.diff_header{background:#f6f8fa;color:#8b949e;
text-align:right;width:36px;user-select:none;font-weight:500;}
table.diff td.diff_next,table.diff td.diff_next a{display:none;}
.diff_add{background:#d1f4d8;color:#0f5132;}
.diff_chg{background:#fff3bf;color:#7a5d00;}
.diff_sub{background:#ffd0d0;color:#82071e;}
</style>"""
def html_diff(learned_path, ref_path, title_l="Learned", title_r="Reference"):
    """Display a side-by-side HTML diff of two PDDL files.

    Both files are canonicalised with ``pddl_canonical`` before diffing, so
    differences in ordering or variable naming do not show up as changes.

    Args:
        learned_path: Path of the file shown on the left.
        ref_path: Path of the file shown on the right.
        title_l: Column title for the left file.
        title_r: Column title for the right file.
    """
    with open(learned_path) as learned_file, open(ref_path) as ref_file:
        learned_lines = pddl_canonical(learned_file.read()).splitlines(keepends=True)
        ref_lines = pddl_canonical(ref_file.read()).splitlines(keepends=True)

    differ = difflib.HtmlDiff(wrapcolumn=80)
    table_html = differ.make_table(
        learned_lines,
        ref_lines,
        fromdesc=title_l,
        todesc=title_r,
        context=False,
    )
    display(HTML(_DIFF_CSS + table_html))
import re
from html import escape
from itertools import zip_longest
from IPython.display import display, HTML
# ---- styling: same shape as html_diff, no diff colors, dark readable text ----
# Stylesheet prepended to the table built by ``html_side_by_side``; it targets
# the ``pddl-sxs`` table and its ``ln`` (line number) and ``code`` cells.
_SXS_CSS = """<style>
table.pddl-sxs{font-family:"JetBrains Mono","SF Mono",Menlo,Consolas,monospace;
font-size:12.5px;border-collapse:collapse;width:100%;border:1px solid #d0d7de;
border-radius:8px;overflow:hidden;margin:8px 0;background:#ffffff;color:#1f2937;
table-layout:fixed;}
table.pddl-sxs th{background:#f6f8fa;color:#24292f;font-weight:600;
padding:10px 12px;border-bottom:1px solid #d0d7de;text-align:left;}
table.pddl-sxs td{padding:2px 10px;vertical-align:top;white-space:pre-wrap;
border:none;line-height:1.55;word-break:break-word;color:#1f2937;}
table.pddl-sxs td.ln{background:#f6f8fa;color:#8b949e;text-align:right;
user-select:none;font-weight:500;border-right:1px solid #eaeef2;}
table.pddl-sxs tr:hover td.code{background:#f9fafb;}
</style>"""
# Color rules — tuned for white background, GitHub-light palette
_PDDL_LOGIC = {"and", "or", "not", "when", "forall", "exists", "imply"}
_PDDL_DEFINE = {"define", "domain", "problem"}
def _pddl_highlight(line):
"""Tokenize a PDDL line and wrap each token in a colored span."""
if not line:
return " "
out, i, next_is_head = [], 0, False
while i < len(line):
ch = line[i]
# comment to end of line
if ch == ";":
out.append(f"<span style='color:#6e7781;font-style:italic'>"
f"{escape(line[i:])}</span>")
break
# whitespace preserved verbatim
if ch in " \t":
j = i
while j < len(line) and line[j] in " \t":
j += 1
out.append(line[i:j])
i = j
continue
# parens (muted)
if ch == "(":
out.append("<span style='color:#8b949e'>(</span>")
i += 1; next_is_head = True
continue
if ch == ")":
out.append("<span style='color:#8b949e'>)</span>")
i += 1
continue
# any other token
j = i
while j < len(line) and line[j] not in " \t()":
j += 1
tok = line[i:j]; i = j
low = tok.lower()
if tok.startswith("?"): # variable
style = "color:#0969da"
elif tok.startswith(":"): # :keyword (incl. :strips, :typing, :action ...)
style = "color:#8250df;font-weight:600"
elif low in _PDDL_LOGIC: # and, not, when, ...
style = "color:#cf222e;font-weight:600"
elif low in _PDDL_DEFINE: # define, domain, problem
style = "color:#8250df;font-weight:700"
elif tok == "-": # type separator
style = "color:#8b949e"
elif next_is_head: # predicate / action / operator head
style = "color:#953800;font-weight:500"
else: # objects, types — default dark slate
style = "color:#1f2937"
out.append(f"<span style='{style}'>{escape(tok)}</span>")
next_is_head = False
return "".join(out) or " "
def html_side_by_side(left_path, right_path,
                      title_l="Learned", title_r="Reference"):
    """Print two PDDL files side by side with syntax highlighting (no diff).

    Both files are canonicalised with ``pddl_canonical`` and paired line by
    line; the shorter one is padded with empty lines.

    Args:
        left_path: Path of the file shown in the left column.
        right_path: Path of the file shown in the right column.
        title_l: Heading of the left column.
        title_r: Heading of the right column.
    """
    with open(left_path) as left_file:
        left_lines = pddl_canonical(left_file.read()).splitlines()
    with open(right_path) as right_file:
        right_lines = pddl_canonical(right_file.read()).splitlines()

    paired = zip_longest(left_lines, right_lines, fillvalue="")
    rows = [
        "<tr>"
        f"<td class='ln'>{number}</td><td class='code'>{_pddl_highlight(left)}</td>"
        f"<td class='ln'>{number}</td><td class='code'>{_pddl_highlight(right)}</td>"
        "</tr>"
        for number, (left, right) in enumerate(paired, start=1)
    ]

    header = (
        f"<thead><tr><th colspan='2'>{escape(title_l)}</th>"
        f"<th colspan='2'>{escape(title_r)}</th></tr></thead>"
    )
    columns = (
        "<colgroup><col style='width:36px'><col>"
        "<col style='width:36px'><col></colgroup>"
    )
    table = (
        "<table class='pddl-sxs'>"
        + columns
        + header
        + "<tbody>" + "".join(rows) + "</tbody></table>"
    )
    display(HTML(_SXS_CSS + table))
# ---------- plotting helper ----------
_GG_COLORS = ["#e24a33", "#348abd", "#988ed5", "#fbc15e", "#777777", "#8eba42"]


def bar_compare(values, title, ylabel, ylim=(0, 1)):
    """Show a bar chart with one bar per entry of ``values``.

    Args:
        values: Mapping from bar label to bar height.
        title: Chart title.
        ylabel: Label of the y axis.
        ylim: ``(low, high)`` limits of the y axis; a falsy value leaves the
            limits to matplotlib.
    """
    names = list(values.keys())
    heights = [values[name] for name in names]
    with plt.style.context("ggplot"):
        fig, ax = plt.subplots(figsize=(6, 3.5))
        ax.bar(names, heights, color=_GG_COLORS[: len(names)])
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        if ylim:
            ax.set_ylim(*ylim)
        # Write each value just above its bar.
        for x_pos, height in enumerate(heights):
            ax.text(
                x_pos,
                height + 0.02,
                "{:.2f}".format(height),
                ha="center",
                fontsize=10,
                color="#333333",
            )
        plt.tight_layout()
        plt.show()
# Silence the unified-planning credits banner by clearing the environment's
# credits stream.
import unified_planning
from unified_planning import shortcuts
shortcuts.get_environment().credits_stream = None
1. Exploring the benchmarks
AMLGym ships dozens of IPC domains. Let’s first see what’s available.
[ ]:
# Show which benchmark domains AMLGym provides.
from amlgym.benchmarks import print_domains
print_domains()
Pick a domain to inspect: let’s start with blocksworld.
[ ]:
from amlgym.benchmarks import get_domain_path, get_domain
# Look up where the blocksworld domain file lives and print that path.
domain_path = get_domain_path('blocksworld')
print(domain_path)
[ ]:
# Fetch the blocksworld domain itself (presumably its PDDL text) and print it.
domain_pddl = get_domain('blocksworld')
print(domain_pddl)
Each domain comes with 10 training trajectories generated from a known reference model. These are what the learners get to see.
[ ]:
from amlgym.benchmarks import get_trajectories_path, get_trajectories, get_problems_path
from pprint import pprint
# Paths of the training trajectories bundled for blocksworld.
trajectory_paths = get_trajectories_path('blocksworld')
pprint(trajectory_paths)
[ ]:
# Print the first blocksworld trajectory to see what a learner is given.
trajectory = get_trajectories('blocksworld')[0]
print(trajectory)
[ ]:
# Print the contents of the first blocksworld problem of kind "learning".
problem_path = get_problems_path('blocksworld', kind="learning")[0]
with open(problem_path) as problem_file:
    print(problem_file.read())
2. Passive learning with full observability — SAM
AMLGym registers several passive learners. Each one assumes something different about the trajectories: full observability, partial observability, or noisy state observations.
[ ]:
# Show the learning algorithms registered in AMLGym.
from amlgym.algorithms import print_algorithms
print_algorithms()
🔮 Prediction: before we learn
With 10 full-observability trajectories of blocksworld, what do you think SAM’s syntactic precision will be?
[ ]:
# Poll on SAM's syntactic precision. The option texts are parsed later by
# pick_bucket, so their numeric ranges and keywords ("Below", "Exactly 1")
# are significant.
poll_sam_precision = make_poll(
"Pick the range you'd bet on:",
["Below 0.5 (a lot of spurious preconditions)",
"0.5 – 0.8 (mostly right but some noise)",
"0.8 – 0.99 (almost perfect)",
"Exactly 1.0 (SAM is safe by construction!)"],
)
Learn the model with SAM
[ ]:
from amlgym.algorithms import get_algorithm
from amlgym.util.util import empty_domain

# Inputs for the learner: an emptied copy of the blocksworld domain plus
# the training trajectories.
sam = get_algorithm('sam')
domain_path = get_domain_path('blocksworld')
domain_empty_path = empty_domain(domain_path)
# To inspect the emptied domain, uncomment:
# with open(domain_empty_path, 'r') as f:
#     pprint(f.read())
traj_paths = get_trajectories_path('blocksworld')

# Learn the model and save it to disk; the metrics below take file paths.
model_sam = sam.learn(domain_empty_path, traj_paths)
domain_sam_path = 'sam_blocksworld.pddl'
with open(domain_sam_path, 'w') as model_file:
    model_file.write(model_sam)
print('Saved learned model to', domain_sam_path)
Evaluate: syntactic precision and recall
[ ]:
from amlgym.metrics import syntactic_precision, syntactic_recall
# Compare the learned SAM model with the reference blocksworld domain.
domain_ref_path = get_domain_path('blocksworld')
prec = syntactic_precision(domain_sam_path, domain_ref_path)
rec = syntactic_recall (domain_sam_path, domain_ref_path)
# Only recall is printed here; precision is shown by the reveal cell that
# follows (presumably so the poll answer is not given away early).
print("Recall:");
pprint(rec)
[ ]:
# Map the mean precision onto one of the poll options and reveal the result.
reveal(
poll_sam_precision,
pick_bucket(prec["mean"], poll_sam_precision.options),
formatter=lambda b: f"{b} (mean precision = {prec['mean']:.3f})",
)
🔍 Diff: learned model vs. reference
Numbers can be misleading — let’s see what SAM actually learned.
[ ]:
# Learned SAM model on the left, reference domain on the right.
html_side_by_side(domain_sam_path, domain_ref_path, "SAM (learned)", "Reference")
🔮 Prediction: problem solving
Now we test how effective the learned model is at solving new planning problems.
[ ]:
# Poll on the solving ratio; pick_bucket parses these option texts later.
poll_solve = make_poll(
"What ratio of test problems can the SAM-learned model solve?",
["Below 0.3", "0.3 – 0.7", "0.7 – 0.99", "1.0 (perfect)"],
)
[ ]:
from amlgym.metrics import problem_solving
# Evaluate the learned model on the blocksworld problems of kind 'solving',
# passing the reference domain alongside it.
probs_paths = get_problems_path('blocksworld', kind='solving')
metrics_sam = problem_solving(domain_sam_path, domain_ref_path, probs_paths, timeout=60)
[ ]:
# Read the ratio under either key name, defaulting to 0.0 when neither exists.
solve_ratio = metrics_sam.get("solving_ratio", metrics_sam.get("solving", 0.0))
print(f"Solving ratio: {solve_ratio:.2f}")
reveal(
poll_solve,
pick_bucket(solve_ratio, poll_solve.options),
formatter=lambda b: f"{b}",
)
Predictive power
This metric goes beyond problem solving: on a test set of states, does the learned model predict the same applicability and effects as the reference one?
🔮 Prediction: predicted-effects precision
Predictive power asks a tougher question than solving: for every test state, does the learned model give the same predicted effects as the reference?
[ ]:
# Poll on predicted-effects precision; pick_bucket parses these option texts later.
poll_predeff = make_poll(
"What do you think the mean precision on predicted effects will be for SAM on blocksworld?",
["Below 0.5 (often predicts wrong fluents)",
"0.5 – 0.8 (mostly right, occasional spurious effects)",
"0.8 – 0.99 (almost identical to the reference)",
"Exactly 1.0 (every predicted effect is correct)"],
)
[ ]:
import os

from amlgym.benchmarks import get_test_states
from amlgym.modeling.UPEnv import UPEnv
from amlgym.metrics import predictive_power

all_test_states = get_test_states('blocksworld')
problem_paths = get_problems_path('blocksworld', kind='predictive_power')
problem_path = problem_paths[0]
# The test states are looked up by the problem's file name. os.path.basename
# extracts it with the platform's own path separator, so the lookup also
# works where paths are not '/'-separated.
test_states = all_test_states[os.path.basename(problem_path)]

# One simulator per model, both instantiated on the same problem.
simulator_learned = UPEnv(domain_sam_path, problem_path)
simulator_ref = UPEnv(domain_ref_path, problem_path)
predictive_metrics = predictive_power(simulator_learned, simulator_ref, test_states)
pprint(predictive_metrics['applicability'])
[ ]:
# Extract the mean precision on predicted effects and reveal the matching option.
pred_eff_prec = float(predictive_metrics["predicted_effects"]["mean_precision"])
reveal(
poll_predeff,
pick_bucket(pred_eff_prec, poll_predeff.options),
formatter=lambda b: f"{b} (mean precision on predicted effects = {pred_eff_prec:.3f})",
)
3. Play with the number of learning traces
Move the slider below to vary how many traces SAM gets to see, and watch how precision and recall react.
How many trajectories do you actually need?
Does SAM learn the domain after 1 or 3 trajectories? Does it really need all 10?
[ ]:
from ipywidgets import interact_manual, IntSlider

EXPLORE_DOMAIN = "barman"
_dom_path = get_domain_path(EXPLORE_DOMAIN)
_dom_empty = empty_domain(_dom_path)
_trajs = get_trajectories_path(EXPLORE_DOMAIN)


@interact_manual(
    n=IntSlider(min=1, max=10, step=1, value=3,
                description="# trajectories")
)
def explore_n_traj(n):
    """Learn EXPLORE_DOMAIN with SAM from the first ``n`` trajectories and
    plot the resulting syntactic precision and recall."""

    def overall(score):
        # Collapse a metric result into one number: its "mean" entry if it
        # has one, else the average of a dict's values, else the value itself.
        if not isinstance(score, dict):
            return float(score)
        if "mean" in score:
            return score["mean"]
        return sum(score.values()) / max(len(score), 1)

    learner = get_algorithm("sam")
    model = learner.learn(_dom_empty, _trajs[:n])

    model_path = f"sam_n{n}.pddl"
    with open(model_path, "w") as model_file:
        model_file.write(model)

    precision = syntactic_precision(model_path, _dom_path)
    recall = syntactic_recall(model_path, _dom_path)
    bar_compare(
        {"precision": overall(precision), "recall": overall(recall)},
        title=f"SAM on {EXPLORE_DOMAIN} with {n} trajectories",
        ylabel="score",
    )
4. Question A: NOLAM vs. OffLAM on tpp (syntactic precision)
Both learners see the same noiseless full trajectories from the tpp domain. Who wins on syntactic precision?
NOLAM is built for noisy traces but we set noise=0. OffLAM was designed for partial observability. Who will be most influenced by the adopted assumptions?
[ ]:
# Question A poll. The option strings must match the winner_a values computed
# in the evaluation cell, because reveal compares them with ==.
poll_qa = make_poll(
"Who do you bet on?",
["OffLAM wins", "NOLAM wins", "It's a tie"],
)
Now let’s run both and see.
[ ]:
# Question A setup: learn tpp with OffLAM and with NOLAM (noise=0.0) from the
# same emptied domain and the same trajectories.
offlam = get_algorithm('offlam')
nolam = get_algorithm('nolam', noise=0.0)
domain_path = get_domain_path('tpp')
domain_empty_path = empty_domain(domain_path)
traj_paths = get_trajectories_path('tpp')
domain_offlam = offlam.learn(domain_empty_path, traj_paths)
domain_nolam = nolam.learn (domain_empty_path, traj_paths)
# Write both learned models to disk; the metrics and viewers take file paths.
domain_offlam_path = 'offlam_tpp.pddl'
domain_nolam_path = 'nolam_tpp.pddl'
with open(domain_offlam_path, 'w') as f: f.write(domain_offlam)
with open(domain_nolam_path , 'w') as f: f.write(domain_nolam)
print('Models saved.')
[ ]:
# Mean syntactic precision and recall of each learned model against the tpp reference.
p_off = syntactic_precision(domain_offlam_path, domain_path)['mean']
p_nol = syntactic_precision(domain_nolam_path , domain_path)['mean']
r_off = syntactic_recall(domain_offlam_path, domain_path)['mean']
r_nol = syntactic_recall(domain_nolam_path , domain_path)['mean']
bar_compare({'OffLAM': p_off, 'NOLAM': p_nol},
title='Question A — syntactic precision on tpp', ylabel='precision')
bar_compare({'OffLAM': r_off, 'NOLAM': r_nol},
title='syntactic recall on tpp', ylabel='recall')
# The winner is decided on precision only; differences of at most 1e-6 count as a tie.
if p_off > p_nol + 1e-6: winner_a = "OffLAM wins"
elif p_nol > p_off + 1e-6: winner_a = "NOLAM wins"
else: winner_a = "It's a tie"
reveal(poll_qa, winner_a,
formatter=lambda w: f"{w} (OffLAM={p_off:.3f}, NOLAM={p_nol:.3f})")
[ ]:
# OffLAM's learned tpp model next to the reference domain.
print('--- OffLAM vs reference ---')
html_side_by_side(domain_offlam_path, domain_path, 'OffLAM', 'Reference')
[ ]:
# NOLAM's learned tpp model next to the reference domain.
print('--- NOLAM vs reference ---')
html_side_by_side(domain_nolam_path, domain_path, 'NOLAM', 'Reference')
5. Question B: SAM vs. OffLAM on goldminer (solving ratio)
Same trajectories, different question. Who produces a model that solves more planning problems?
[ ]:
# Question B poll. The option strings must match the winner_b values computed
# in the evaluation cell, because reveal compares them with ==.
poll_qb = make_poll(
"Who do you bet on?",
["SAM wins", "OffLAM wins", "It's a tie"],
)
[ ]:
# Question B setup: learn goldminer with SAM and with OffLAM from the same
# emptied domain and the same trajectories.
sam_b = get_algorithm('sam')
offlam_b = get_algorithm('offlam')
domain_path_b = get_domain_path('goldminer')
domain_empty_b = empty_domain(domain_path_b)
traj_paths_b = get_trajectories_path('goldminer')
model_sam_b = sam_b .learn(domain_empty_b, traj_paths_b)
model_offlam_b = offlam_b.learn(domain_empty_b, traj_paths_b)
# Write both learned models to disk for the evaluation and viewer cells.
sam_path_b = 'sam_goldminer.pddl'
offlam_path_b = 'offlam_goldminer.pddl'
with open(sam_path_b , 'w') as f: f.write(model_sam_b)
with open(offlam_path_b, 'w') as f: f.write(model_offlam_b)
[ ]:
# Evaluate both learned models on the goldminer problems of kind 'solving'.
probs_b = get_problems_path('goldminer', kind='solving')
m_sam = problem_solving(sam_path_b , domain_path_b, probs_b, timeout=60, show_progress=False)
m_offlam = problem_solving(offlam_path_b, domain_path_b, probs_b, timeout=60, show_progress=False)
# Read the ratio under either key name, defaulting to 0.0 when neither exists.
s_sam = m_sam.get('solving_ratio', m_sam .get('solving', 0.0))
s_offlam = m_offlam.get('solving_ratio', m_offlam.get('solving', 0.0))
bar_compare({'SAM': s_sam, 'OffLAM': s_offlam},
title='Question B — solving ratio on goldminer', ylabel='Solving ratio')
# Ratios are compared exactly here (no tolerance, unlike Question A).
if s_sam > s_offlam: winner_b = "SAM wins"
elif s_offlam > s_sam: winner_b = "OffLAM wins"
else: winner_b = "It's a tie"
reveal(poll_qb, winner_b,
formatter=lambda w: f"{w} (SAM={s_sam:.3f}, OffLAM={s_offlam:.3f})")
[ ]:
# OffLAM's learned goldminer model next to the reference domain.
html_side_by_side(offlam_path_b, domain_path_b, 'OffLAM', 'Reference')
[ ]:
# SAM's learned goldminer model next to the reference domain.
html_side_by_side(sam_path_b, domain_path_b, 'SAM', 'Reference')
6. Final challenge: the leaderboard 🏆
Pick a domain, pick an algorithm, and submit your run. Your solving ratio on the test problems is your score.
Submissions accumulate in the leaderboard below. Whoever’s at the top when the session ends wins.
Tip: try a domain you haven’t seen yet — parking, …
[ ]:
from amlgym.benchmarks import get_domain_names
# Choices offered by the leaderboard dropdowns.
ALL_DOMAINS = sorted(get_domain_names())
ALL_ALGORITHMS = ["sam", "offlam", "nolam", "rosame"]
# Lives at module level, so entries persist across button clicks.
leaderboard = [] # accumulating list of dicts
def _score(algo_name, domain_name):
    """Learn ``domain_name`` with ``algo_name`` and measure its solving ratio.

    The learned model is written to
    ``leaderboard_<algo_name>_<domain_name>.pddl`` in the working directory
    and evaluated on the domain's problems of kind "solving" with a
    3-second timeout.

    Returns:
        ``(ratio, metrics)``: the solving ratio (0.0 when the metrics carry
        neither a "solving_ratio" nor a "solving" entry) and the full
        metrics object.
    """
    learner = get_algorithm(algo_name)
    reference_path = get_domain_path(domain_name)
    blank_path = empty_domain(reference_path)
    trajectories = get_trajectories_path(domain_name)
    learned_model = learner.learn(blank_path, trajectories)

    model_path = f"leaderboard_{algo_name}_{domain_name}.pddl"
    with open(model_path, "w") as model_file:
        model_file.write(learned_model)

    print(f"Computing problem solving ratio of {algo_name} on {domain_name} ...")
    problems = get_problems_path(domain_name, kind="solving")
    metrics = problem_solving(
        model_path,
        reference_path,
        problems,
        timeout=3,
        show_progress=False,
    )
    ratio = metrics.get("solving_ratio", metrics.get("solving", 0.0))
    return ratio, metrics
# Input widgets of the submission form; out_w captures everything a run prints.
name_w = widgets.Text(value="", placeholder="Your name / team",
description="Name:")
dom_w = widgets.Dropdown(options=ALL_DOMAINS, value=ALL_DOMAINS[0],
description="Domain:")
algo_w = widgets.Dropdown(options=ALL_ALGORITHMS, value="sam",
description="Algorithm:")
go_btn = widgets.Button(description="🚀 Submit run",
button_style="success")
out_w = widgets.Output()
def _on_click(_):
    """Button handler: run the selected algorithm and refresh the leaderboard.

    Output goes to ``out_w``, which is cleared first. A failing run prints
    its error and leaves the leaderboard untouched.
    """
    out_w.clear_output()
    with out_w:
        nm = name_w.value.strip() or "anonymous"
        algo = algo_w.value
        dom = dom_w.value
        print(f"Running {algo} on {dom} ...")
        try:
            ratio, _metrics = _score(algo, dom)
        except Exception as err:
            # Boundary of the UI callback: report the failure, do not raise.
            print("Run failed:", err)
            return

        entry = {
            "name": nm,
            "algorithm": algo,
            "domain": dom,
            "solving_ratio": round(ratio, 3),
        }
        leaderboard.append(entry)

        table = pd.DataFrame(leaderboard)
        table = table.sort_values("solving_ratio", ascending=False)
        table = table.reset_index(drop=True)
        table.index = table.index + 1  # ranks start at 1
        display(Markdown("### 🏆 Leaderboard"))
        display(table)
# Hook the handler to the button and show the form with its output area below.
go_btn.on_click(_on_click)
display(widgets.VBox([name_w, dom_w, algo_w, go_btn, out_w]))
Closing thoughts
We have now seen the full cycle: pick a domain and its associated training set of trajectories, pick a learning algorithm, learn a domain model, and evaluate that model with syntactic, problem-solving, and predictive-power metrics. Two ideas worth taking home:
The right algorithm depends on the data you actually have. SAM is powerful when traces are fully observable; OffLAM when they are partially observable; NOLAM when state observations are noisy.
Syntactic, solving and predictive power metrics are complementary. For example: a model can be syntactically imperfect but still solve every test problem. Evaluate against the task you care about.
Thanks for playing!
📝 Tell us what you think: anonymous questionnaire