{
 "schema": "vela.frontier-twin.v0.1",
 "what": "Machine twin of the frontier page family: the same loader output the human pages render, re-serialized. The page footer prints this document's sha256 as computed at bake.",
 "slug": "benchmark-state",
 "vfr_id": "vfr_efc649fd772a1ff1",
 "name": "AI-for-science benchmark state",
 "head": {
  "id": "vev_d199cb2e417c4f42",
  "after_hash": "sha256:d02f91f588dc319a78e0f134eec5e6deafce383ae772693ec4cfb548e76e60d0"
 },
 "reproduce": "vela reproduce projects/benchmark-state",
 "snapshot_hash": null,
 "stats": {
  "avg_confidence": 0.3,
  "categories": {
   "computational": 12
  },
  "condition_record_count": 12,
  "confidence_distribution": {
   "high_gt_80": 0,
   "low_lt_60": 12,
   "medium_60_80": 0
  },
  "confidence_update_count": 0,
  "contested": 0,
  "event_count": 24,
  "evidence_atom_count": 12,
  "findings": 12,
  "gaps": 12,
  "human_reviewed": 12,
  "link_types": {},
  "links": 0,
  "negative_space": 0,
  "proposal_count": 24,
  "replicated": 0,
  "review_event_count": 24,
  "source_count": 1,
  "unreplicated": 12
 },
 "frontier": {
  "compiled_at": "2026-06-10T06:50:03Z",
  "compiler": "vela/0.691.0",
  "dependencies": [],
  "description": "",
  "errors": 0,
  "name": "AI-for-science benchmark state",
  "papers_processed": 0
 },
 "proof_state": {
  "latest_packet": {
   "event_log_hash": null,
   "generated_at": null,
   "packet_manifest_hash": null,
   "snapshot_hash": null,
   "status": "never_exported"
  }
 },
 "actors": [
  {
   "algorithm": "ed25519",
   "created_at": "2026-06-10T06:50:03.904109+00:00",
   "id": "reviewer:will-blair",
   "public_key": "4892f93877e637b5f59af31d9ec6704814842fb278cacb0eb94704baef99455e"
  }
 ],
 "events": [
  {
   "k": 0,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:7bfbae3a43915a84470520c2e45c24541de96eb8c25ea257d50f1556e981f476",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_5a33eaff97407ac8",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_ce433e03bf245f79"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_cf89ac0f36e62089",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.819775+00:00"
  },
  {
   "k": 1,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:b381760051914d504474356d819f4209161578ecc5766a73786773482e49d131",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_b396d3a2727ae019",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_f3a3a73919f9eb51"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_55068262f49df0ab",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.829852+00:00"
  },
  {
   "k": 2,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:ac87dcff1ebd974fcde4edfeb726aa72fca6b0146f00342156265bf6a1475f23",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_e5d45a5605897295",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_f08818383df2a902"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_368ec6ffb5747092",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.840411+00:00"
  },
  {
   "k": 3,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:42db6392a66750c512302f7564b0937e6becfdd045048c59525795cf3131ff8a",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_03b2b7f5e7e0be96",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_8ebb01be4aedad3b"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_9a454a597ddee070",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.851504+00:00"
  },
  {
   "k": 4,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:6c12d8a2c4b3bf90ac2f228bb6c06af78c9063b42c2f01b1496dc6a10e1c18bd",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_fc4e6c758136cecd",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_684dfb8e796321d2"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_fec6f956d525e753",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.862941+00:00"
  },
  {
   "k": 5,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:7241f04d549de6875e9bb7b04fa8425bf968050e83e2a57ad1694c4314377f86",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_31b40ef5e25c88b6",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_965bbdde5ff53044"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_dce7a34adf2878f2",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.874704+00:00"
  },
  {
   "k": 6,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:bc813b05518c754e699699d8b8acefc309ae53a48e262f325c1fe21dc309a3c0",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_f17f5a864754e2a0",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_d3f3228bb463c2d9"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_ec4bb8feca206bf2",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.887372+00:00"
  },
  {
   "k": 7,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:fffbd83e2d04817fcbabfd6be3032da4b29f94be8dfdf0a440720c6c51dfe3ab",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_bb98a228e1f4a5a7",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_7d99f50cb897566d"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_03776d6cd3e0801b",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.900573+00:00"
  },
  {
   "k": 8,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:ead759552534e5c6bdaa70c3a3849e984bec6c2a0ad042cbaf7cc7ea2c996bb3",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_39ad7234d713069a",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_212f7910ba3adf91"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_170c9a0e01a9b1d3",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.914854+00:00"
  },
  {
   "k": 9,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:87ed48a736771700b582f05c238d84557330b6bacb0a293ccf227bb9649483e6",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_bd0ec86a1be50d66",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_be9c7dcdf52b3be5"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_cc50639072ba1867",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.929430+00:00"
  },
  {
   "k": 10,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:779efff7bd8306e55dad742c2e81e7698b607618b738b20076479d4fe69a92f6",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_4869af225af70848",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_adea2a2f9e4ba533"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_41030d44f59eae22",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.945317+00:00"
  },
  {
   "k": 11,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:19784465db1faced9eaf4c4ad740766cd21a1fc1cb074184e28c48be67012df2",
   "before_hash": "sha256:null",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "id": "vev_07e40e45981061ad",
   "kind": "finding.asserted",
   "payload": {
    "proposal_id": "vpr_c555bef607043399"
   },
   "reason": "Manual finding added to frontier state",
   "schema": "vela.event.v0.1",
   "target": {
    "id": "vf_8212daf3d7034a93",
    "type": "finding"
   },
   "timestamp": "2026-06-10T06:50:55.962213+00:00"
  },
  {
   "k": 12,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:d6602e34af1bbd326aaa60d719b3894e72baf49f8a109d28a75d22fa67718040",
   "before_hash": "sha256:fffbd83e2d04817fcbabfd6be3032da4b29f94be8dfdf0a440720c6c51dfe3ab",
   "caveats": [],
   "id": "vev_eb4221b9d34b54cd",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_f1e498de0225245d",
    "proposal_id": "vpr_e4bebe83c4ec21eb",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "e6bd39218f11f2e83dfc2febac2be948c164e9a6a01686b58098458ea533dfb0c6cf7b50847284b3cb1cbc757d218885f9bff10f6e3db1b13a0d8ced07f2060e",
   "target": {
    "id": "vf_03776d6cd3e0801b",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:44.988028+00:00"
  },
  {
   "k": 13,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:dc3e8bcfca4d0bc96736bc4c6e0143b80c495ebc856badf8faa6a4906b27f11d",
   "before_hash": "sha256:ead759552534e5c6bdaa70c3a3849e984bec6c2a0ad042cbaf7cc7ea2c996bb3",
   "caveats": [],
   "id": "vev_5064c841d30e8aaf",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_7d3dfff92186e52c",
    "proposal_id": "vpr_d1d96c52036f153d",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "8890fa3553ae65650001f72c12798c7954a608d4d54153fd79e057d9c963d7c04d4c9622120b4b9c214e77cf65919e92723b9cf12e299e89a31c836d9f109e09",
   "target": {
    "id": "vf_170c9a0e01a9b1d3",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.014955+00:00"
  },
  {
   "k": 14,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:3e9f8f484adc62300160e156029ae633ab1896f91478d3f45419a259375b0108",
   "before_hash": "sha256:ac87dcff1ebd974fcde4edfeb726aa72fca6b0146f00342156265bf6a1475f23",
   "caveats": [],
   "id": "vev_2e1f4d109d1a1f73",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_b1f2b5969ee3887b",
    "proposal_id": "vpr_6a1b9f61788f93f0",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "6cf0d834948cd87a9c8abcf363d9dc5be16563f64be68b2e775873709e770c84d13734f00cb9404f331cd3b296f6a9389366868e8d491fc430e7f328b04b7d00",
   "target": {
    "id": "vf_368ec6ffb5747092",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.038565+00:00"
  },
  {
   "k": 15,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:dd9be8a780d7ab98fa9da4b1a8fc3819f88cfacdb73fda99c95fd9bfe6f825b7",
   "before_hash": "sha256:779efff7bd8306e55dad742c2e81e7698b607618b738b20076479d4fe69a92f6",
   "caveats": [],
   "id": "vev_a73023eb43fa7387",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_ab3ecb7585481522",
    "proposal_id": "vpr_edef714318aa82be",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "66950a7d65e124765815f114e6edf6d88d0e8ac869c15e3c8524ad4e63d7bbea1b1db6edaaf772c30fbbff750aca351c60c9add96106600003edd4562f02b203",
   "target": {
    "id": "vf_41030d44f59eae22",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.061196+00:00"
  },
  {
   "k": 16,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:afc19db3f0499f215dc6915714b8c56e6c196407652fe2c01330f054cc3e22f9",
   "before_hash": "sha256:b381760051914d504474356d819f4209161578ecc5766a73786773482e49d131",
   "caveats": [],
   "id": "vev_804497a5a8fbe4a0",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_a90a9a26863b2dc8",
    "proposal_id": "vpr_fb5a71c197133639",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "60a3587fe908ae2ccc956128e88ab599c7ba27cf7ecefbc623d6e24f657a823ace7a95387c5f673529fbf689b37e0e5b3e466143f64c2cb47e576519170d1507",
   "target": {
    "id": "vf_55068262f49df0ab",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.084566+00:00"
  },
  {
   "k": 17,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:c9ac9523835561c8c503f69fcb32fc7a395ecafd57f4be20a08fd2a9ea03da5a",
   "before_hash": "sha256:19784465db1faced9eaf4c4ad740766cd21a1fc1cb074184e28c48be67012df2",
   "caveats": [],
   "id": "vev_c4da9db8be63634e",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_4fd1976cc05c9931",
    "proposal_id": "vpr_fd307d5d15c2cef7",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "1b0429e90bd8b606189c3ef5b6aea7875441eeaf9523c83888fd8ef9a3365909f4b4a02b574987f5c5c63f786074cd40b441a2c1f1d9d2967ac5d6a212513509",
   "target": {
    "id": "vf_8212daf3d7034a93",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.107670+00:00"
  },
  {
   "k": 18,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:5e386fa4faa589c0f4a8634a6e1cd93cb09d94198397d8fa8d9349547c8eae4c",
   "before_hash": "sha256:42db6392a66750c512302f7564b0937e6becfdd045048c59525795cf3131ff8a",
   "caveats": [],
   "id": "vev_4e2e2a5f25a8e28f",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_bb601231fa4a954c",
    "proposal_id": "vpr_66758152772dd461",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "4a02f57d35951872e1991e39766340f916d27cc4f02ff873c88b7ee12bfd6c9a52f0ce4fc5f3567745c1cca7cfee1f23720691559b96060df88b056320ab4e04",
   "target": {
    "id": "vf_9a454a597ddee070",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.130467+00:00"
  },
  {
   "k": 19,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:6156e527d2c8c02dcc3c2a92446d26fdcf639a523672db04a65354831ac63cd5",
   "before_hash": "sha256:87ed48a736771700b582f05c238d84557330b6bacb0a293ccf227bb9649483e6",
   "caveats": [],
   "id": "vev_270eaf05963c65df",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_47bd11245ae9c46a",
    "proposal_id": "vpr_74c27456b2783c8d",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "38c2842dab18b632d70d22f24b6017cfb0cbbb38901916614dbc797498211fb50a3d1d56d8eb030e472e91072b98099dfab561a238958d088c4e1332baab920a",
   "target": {
    "id": "vf_cc50639072ba1867",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.153703+00:00"
  },
  {
   "k": 20,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:3c19d8dd5f0a90516b0bff3d5cef27a5e776b4fb1db90438012b1e306adf8ca1",
   "before_hash": "sha256:7bfbae3a43915a84470520c2e45c24541de96eb8c25ea257d50f1556e981f476",
   "caveats": [],
   "id": "vev_f032a45ff0886024",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_02fa69957d8c96a0",
    "proposal_id": "vpr_2ad76a3dce783d96",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "e87bf6797fd97acf3d5e656e0edccab8fbb3132978a736a9bdbdda27d7e2d46797e7699038a306f93202d0628f66a8e451a68a94093436685e1421e21223a802",
   "target": {
    "id": "vf_cf89ac0f36e62089",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.178012+00:00"
  },
  {
   "k": 21,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:0fdde94c6892fcc4dca4d05b4789bb0f237c79d7b0f5fae281c90637be1fbe30",
   "before_hash": "sha256:7241f04d549de6875e9bb7b04fa8425bf968050e83e2a57ad1694c4314377f86",
   "caveats": [],
   "id": "vev_c75d1f9984ddd3ab",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_3a73e7715ffb9012",
    "proposal_id": "vpr_01ce4a9a77a73640",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "40026cd45b406b7b6ef56b0184c0ce4105ae60b0eaec0102fc503ff9f364ee3f2e29c38586d8fc3fd622478f17f66a6e197b6ece89828d53cc5482c467229e0a",
   "target": {
    "id": "vf_dce7a34adf2878f2",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.200404+00:00"
  },
  {
   "k": 22,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:05b5b00ff580fa84fe22a223425a03517ea00aa0aa59d4e8c5a71844b2fb9287",
   "before_hash": "sha256:bc813b05518c754e699699d8b8acefc309ae53a48e262f325c1fe21dc309a3c0",
   "caveats": [],
   "id": "vev_b11de7b18f8b9f24",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_9452618fd0e9221d",
    "proposal_id": "vpr_9496dacae43645bc",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "904f09c9a4b8a60e06fd5cda29fc81055ee3d7c8ce89468c20bebbeca9c5d648b7ba084d61d42295dafd9a1700a6b5bf7d92d7e32e0e4ed647fe12db4010b10a",
   "target": {
    "id": "vf_ec4bb8feca206bf2",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.222264+00:00"
  },
  {
   "k": 23,
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "after_hash": "sha256:d02f91f588dc319a78e0f134eec5e6deafce383ae772693ec4cfb548e76e60d0",
   "before_hash": "sha256:6c12d8a2c4b3bf90ac2f228bb6c06af78c9063b42c2f01b1496dc6a10e1c18bd",
   "caveats": [],
   "id": "vev_d199cb2e417c4f42",
   "kind": "finding.noted",
   "payload": {
    "annotation_id": "ann_efa0109e17512289",
    "proposal_id": "vpr_cf4939a974d7a904",
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "schema": "vela.event.v0.1",
   "signature": "ea698a6d044a2ebe432bb7e4917bda2d6a27a06312ddfab32a538e7ef0857677798158675b96c5a92f83f6b32cfc31f41a562b908b5c09a57e243dc8d830bf02",
   "target": {
    "id": "vf_fec6f956d525e753",
    "type": "finding"
   },
   "timestamp": "2026-06-10T23:01:45.244978+00:00"
  }
 ],
 "findings": [
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_f1e498de0225245d",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:44.988028+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "BENCHMARK CLAIM (ProteinGym) — ESM-1v REPORTS strong zero-shot substitution performance from masked-marginal scoring of a protein language model. VERIFICATION STATE: author-reported; weights public; depends on the scoring convention (masked-marginal vs wt-marginal) and the ProteinGym version. NOT re-run here. Open obligation: re-score the released model on the pinned v1.1 zero-shot substitution set.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.899505+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_03776d6cd3e0801b",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.899492+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_7d3dfff92186e52c",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.014955+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "BENCHMARK CLAIM (ProteinGym) — Tranception + retrieval (and TranceptEVE, combining Tranception with the EVE family model) REPORT leading zero-shot Spearman by mixing an autoregressive PLM with MSA-derived statistics. VERIFICATION STATE: author-reported; MSA-dependent, so the number moves with the alignment pipeline. NOT re-run here. Open obligation: reproduce with the stated MSAs and depth.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.913757+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_170c9a0e01a9b1d3",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.913744+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_b1f2b5969ee3887b",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.038565+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "BENCHMARK CLAIM (MiniF2F) — Draft-Sketch-Prove (DSP) REPORTS improved miniF2F-test pass by drafting an informal proof, sketching a formal skeleton, then closing gaps with an ATP. VERIFICATION STATE: author-reported; pipeline described; depends on the underlying ATP and the autoformalizer, both of which drift. NOT re-run here. Open obligation: reproduce with pinned ATP + LLM versions.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.839703+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_368ec6ffb5747092",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.839691+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_ab3ecb7585481522",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.061196+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "BENCHMARK CLAIM (ProteinGym) — ProteinNPT (non-parametric transformer, supervised track) REPORTS gains by attending across labelled neighbours. VERIFICATION STATE: author-reported; SUPERVISED — not comparable to zero-shot numbers; depends on the cross-validation split. NOT re-run here. Open obligation: re-run under the official supervised CV split; never compare against zero-shot rows.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.944113+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_41030d44f59eae22",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.944101+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_a90a9a26863b2dc8",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.084566+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "BENCHMARK CLAIM (MiniF2F) — DeepSeek-Prover-V1.5 REPORTS a leading miniF2F-test pass rate under a large sampling budget (RMaxTS). VERIFICATION STATE: author-reported; model weights public; eval harness in the paper; dataset version = the team's stated split. NOT independently re-run in this frontier. Open obligation: pin the split, re-run the released checkpoint, audit train/test contamination of the formal statements.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.829210+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_55068262f49df0ab",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.829198+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_4fd1976cc05c9931",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.107670+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "LEAKAGE HAZARD (ProteinGym). PLMs trained on UniProt may have seen sequences related to the assay proteins; 'zero-shot' is zero-shot on the LABELS, not necessarily on the SEQUENCES. VERIFICATION STATE: training-sequence overlap with assay proteins is under-audited. Open obligation: a sequence-similarity leakage audit between each model's training set and the assay proteins before banking any SOTA claim.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.960939+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_8212daf3d7034a93",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.960927+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_bb601231fa4a954c",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.130467+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "BENCHMARK CLAIM (MiniF2F) — HyperTree Proof Search (HTPS, Lample et al.) REPORTS a miniF2F pass rate via learned best-first proof search. VERIFICATION STATE: author-reported; search budget and version-specific. NOT re-run here. Open obligation: re-run at the stated budget on a pinned split.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.850689+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_9a454a597ddee070",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.850676+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_47bd11245ae9c46a",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.153703+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "BENCHMARK CLAIM (ProteinGym) — EVE (evolutionary VAE over an MSA) REPORTS strong variant-effect prediction, especially for clinical variants. VERIFICATION STATE: author-reported; fully MSA-dependent; per-protein model fitting. NOT re-run here. Open obligation: re-fit on pinned MSAs and confirm the held-out assay Spearman.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.928257+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_cc50639072ba1867",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.928244+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_02fa69957d8c96a0",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.178012+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "BENCHMARK META (MiniF2F). MiniF2F is ~488 olympiad/textbook formal-math problems (AMC/AIME/IMO + MATH), ported to Lean/Isabelle/HOL-Light/Metamath, split valid/test. KNOWN TRUST ISSUE: multiple incompatible versions exist (original 2021, miniF2F-v2, and the 'miniF2F Revisited' cleanup with corrected/changed statements), so pass-rates across papers are version-ambiguous unless the exact split is pinned. STATE: dataset-version hazard, not a model claim.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.819124+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_cf89ac0f36e62089",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.819111+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_3a73e7715ffb9012",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.200404+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "FAITHFULNESS HAZARD (MiniF2F). A reported 'solve' is only as good as the autoformalized statement matching the intended problem; the miniF2F Revisited effort found statements that were mis-stated or trivially true. VERIFICATION STATE: faithfulness of the FORMAL statement to the INFORMAL problem is the under-checked axis. Open obligation: every banked miniF2F solve needs a statement-faithfulness attestation (vela attest --scope formalism-fidelity).",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.873832+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_dce7a34adf2878f2",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.873820+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_9452618fd0e9221d",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.222264+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "BENCHMARK META (ProteinGym). ProteinGym benchmarks variant-effect prediction against deep mutational scanning (DMS) assays: a substitution benchmark (~217 assays) and an indel benchmark, with zero-shot and supervised tracks, scored by Spearman correlation (and AUC/MCC). KNOWN TRUST ISSUE: v1.0 vs v1.1 differ in assay set and splits; zero-shot vs supervised numbers are not comparable; MSA-dependent methods vary with the MSA pipeline. STATE: dataset-version + track-conflation hazard.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.886459+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_ec4bb8feca206bf2",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.886449+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  },
  {
   "annotations": [
    {
     "author": "reviewer:will-blair",
     "id": "ann_efa0109e17512289",
     "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
     "timestamp": "2026-06-10T23:01:45.244978+00:00"
    }
   ],
   "assertion": {
    "direction": null,
    "entities": [],
    "relation": null,
    "text": "BENCHMARK CLAIM (MiniF2F) — AlphaProof (DeepMind) solved several IMO-2024 problems formally in Lean; this is sometimes conflated with a miniF2F number. VERIFICATION STATE: the IMO result is a separate, time-bounded competition claim, NOT a miniF2F-test pass rate; no public checkpoint. Open obligation: do not record an AlphaProof miniF2F figure without a cited, pinned evaluation.",
    "type": "computational"
   },
   "conditions": {
    "age_group": null,
    "cell_type": null,
    "clinical_trial": false,
    "concentration_range": null,
    "duration": null,
    "human_data": false,
    "in_vitro": false,
    "in_vivo": false,
    "species_unverified": [],
    "species_verified": [],
    "text": "Manually added finding; requires evidence review before scientific use."
   },
   "confidence": {
    "basis": "operator-supplied frontier prior; review required",
    "extraction_confidence": 1,
    "kind": "frontier_epistemic",
    "method": "expert_judgment",
    "score": 0.3
   },
   "created": "2026-06-10T06:50:55.862148+00:00",
   "evidence": {
    "effect_size": null,
    "evidence_spans": [],
    "method": "manual state transition",
    "model_system": "",
    "p_value": null,
    "replicated": false,
    "replication_count": null,
    "sample_size": null,
    "species": null,
    "type": "computational"
   },
   "flags": {
    "contested": false,
    "declining": false,
    "gap": true,
    "gravity_well": false,
    "negative_space": false,
    "retracted": false
   },
   "id": "vf_fec6f956d525e753",
   "links": [],
   "previous_version": null,
   "provenance": {
    "authors": [
     {
      "name": "reviewer:will-blair",
      "orcid": null
     }
    ],
    "citation_count": null,
    "doi": null,
    "extraction": {
     "extracted_at": "2026-06-10T06:50:55.862137+00:00",
     "extractor_version": "vela/0.691.0",
     "method": "manual_curation",
     "model": null,
     "model_version": null
    },
    "journal": null,
    "openalex_id": null,
    "pmc": null,
    "pmid": null,
    "review": {
     "corrections": [],
     "reviewed": false,
     "reviewed_at": null,
     "reviewer": null
    },
    "source_type": "expert_assertion",
    "title": "manual finding",
    "year": null
   },
   "updated": null,
   "version": 1
  }
 ],
 "proposals": [
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_c75d1f9984ddd3ab",
   "caveats": [],
   "created_at": "2026-06-10T22:35:36.038393+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_01ce4a9a77a73640",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.200385+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_dce7a34adf2878f2",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_39ad7234d713069a",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.913795+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_212f7910ba3adf91",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "BENCHMARK CLAIM (ProteinGym) — Tranception + retrieval (and TranceptEVE, combining Tranception with the EVE family model) REPORT leading zero-shot Spearman by mixing an autoregressive PLM with MSA-derived statistics. VERIFICATION STATE: author-reported; MSA-dependent, so the number moves with the alignment pipeline. NOT re-run here. Open obligation: reproduce with the stated MSAs and depth.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.913757+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_170c9a0e01a9b1d3",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.913744+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.914821+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_170c9a0e01a9b1d3",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_f032a45ff0886024",
   "caveats": [],
   "created_at": "2026-06-10T22:35:36.017836+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_2ad76a3dce783d96",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.177995+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_cf89ac0f36e62089",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_4e2e2a5f25a8e28f",
   "caveats": [],
   "created_at": "2026-06-10T22:35:35.978629+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_66758152772dd461",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.130450+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_9a454a597ddee070",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_fc4e6c758136cecd",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.862181+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_684dfb8e796321d2",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "BENCHMARK CLAIM (MiniF2F) — AlphaProof (DeepMind) solved several IMO-2024 problems formally in Lean; this is sometimes conflated with a miniF2F number. VERIFICATION STATE: the IMO result is a separate, time-bounded competition claim, NOT a miniF2F-test pass rate; no public checkpoint. Open obligation: do not record an AlphaProof miniF2F figure without a cited, pinned evaluation.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.862148+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_fec6f956d525e753",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.862137+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.862907+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_fec6f956d525e753",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_2e1f4d109d1a1f73",
   "caveats": [],
   "created_at": "2026-06-10T22:35:35.899729+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_6a1b9f61788f93f0",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.038546+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_368ec6ffb5747092",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_270eaf05963c65df",
   "caveats": [],
   "created_at": "2026-06-10T22:35:35.998936+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_74c27456b2783c8d",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.153683+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_cc50639072ba1867",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_bb98a228e1f4a5a7",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.899544+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_7d99f50cb897566d",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "BENCHMARK CLAIM (ProteinGym) — ESM-1v REPORTS strong zero-shot substitution performance from masked-marginal scoring of a protein language model. VERIFICATION STATE: author-reported; weights public; depends on the scoring convention (masked-marginal vs wt-marginal) and the ProteinGym version. NOT re-run here. Open obligation: re-score the released model on the pinned v1.1 zero-shot substitution set.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.899505+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_03776d6cd3e0801b",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.899492+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.900540+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_03776d6cd3e0801b",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_03b2b7f5e7e0be96",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.850727+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_8ebb01be4aedad3b",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "BENCHMARK CLAIM (MiniF2F) — HyperTree Proof Search (HTPS, Lample et al.) REPORTS a miniF2F pass rate via learned best-first proof search. VERIFICATION STATE: author-reported; search budget and version-specific. NOT re-run here. Open obligation: re-run at the stated budget on a pinned split.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.850689+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_9a454a597ddee070",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.850676+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.851472+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_9a454a597ddee070",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_b11de7b18f8b9f24",
   "caveats": [],
   "created_at": "2026-06-10T22:35:36.056745+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_9496dacae43645bc",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.222248+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_ec4bb8feca206bf2",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_31b40ef5e25c88b6",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.873867+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_965bbdde5ff53044",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "FAITHFULNESS HAZARD (MiniF2F). A reported 'solve' is only as good as the autoformalized statement matching the intended problem; the miniF2F Revisited effort found statements that were mis-stated or trivially true. VERIFICATION STATE: faithfulness of the FORMAL statement to the INFORMAL problem is the under-checked axis. Open obligation: every banked miniF2F solve needs a statement-faithfulness attestation (vela attest --scope formalism-fidelity).",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.873832+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_dce7a34adf2878f2",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.873820+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.874673+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_dce7a34adf2878f2",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_4869af225af70848",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.944152+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_adea2a2f9e4ba533",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "BENCHMARK CLAIM (ProteinGym) — ProteinNPT (non-parametric transformer, supervised track) REPORTS gains by attending across labelled neighbours. VERIFICATION STATE: author-reported; SUPERVISED — not comparable to zero-shot numbers; depends on the cross-validation split. NOT re-run here. Open obligation: re-run under the official supervised CV split; never compare against zero-shot rows.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.944113+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_41030d44f59eae22",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.944101+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.945284+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_41030d44f59eae22",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_bd0ec86a1be50d66",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.928294+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_be9c7dcdf52b3be5",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "BENCHMARK CLAIM (ProteinGym) — EVE (evolutionary VAE over an MSA) REPORTS strong variant-effect prediction, especially for clinical variants. VERIFICATION STATE: author-reported; fully MSA-dependent; per-protein model fitting. NOT re-run here. Open obligation: re-fit on pinned MSAs and confirm the held-out assay Spearman.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.928257+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_cc50639072ba1867",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.928244+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.929398+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_cc50639072ba1867",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_07e40e45981061ad",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.960978+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_c555bef607043399",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "LEAKAGE HAZARD (ProteinGym). PLMs trained on UniProt may have seen sequences related to the assay proteins; 'zero-shot' is zero-shot on the LABELS, not necessarily on the SEQUENCES. VERIFICATION STATE: training-sequence overlap with assay proteins is under-audited. Open obligation: a sequence-similarity leakage audit between each model's training set and the assay proteins before banking any SOTA claim.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.960939+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_8212daf3d7034a93",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.960927+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.962181+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_8212daf3d7034a93",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_5a33eaff97407ac8",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.819166+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_ce433e03bf245f79",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "BENCHMARK META (MiniF2F). MiniF2F is ~488 olympiad/textbook formal-math problems (AMC/AIME/IMO + MATH), ported to Lean/Isabelle/HOL-Light/Metamath, split valid/test. KNOWN TRUST ISSUE: multiple incompatible versions exist (original 2021, miniF2F-v2, and the 'miniF2F Revisited' cleanup with corrected/changed statements), so pass-rates across papers are version-ambiguous unless the exact split is pinned. STATE: dataset-version hazard, not a model claim.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.819124+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_cf89ac0f36e62089",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.819111+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.819740+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_cf89ac0f36e62089",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_d199cb2e417c4f42",
   "caveats": [],
   "created_at": "2026-06-10T22:35:36.074822+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_cf4939a974d7a904",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.244961+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_fec6f956d525e753",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_5064c841d30e8aaf",
   "caveats": [],
   "created_at": "2026-06-10T22:35:35.881842+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_d1d96c52036f153d",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.014937+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_170c9a0e01a9b1d3",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_f17f5a864754e2a0",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.886493+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_d3f3228bb463c2d9",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "BENCHMARK META (ProteinGym). ProteinGym benchmarks variant-effect prediction against deep mutational scanning (DMS) assays: a substitution benchmark (~217 assays) and an indel benchmark, with zero-shot and supervised tracks, scored by Spearman correlation (and AUC/MCC). KNOWN TRUST ISSUE: v1.0 vs v1.1 differ in assay set and splits; zero-shot vs supervised numbers are not comparable; MSA-dependent methods vary with the MSA pipeline. STATE: dataset-version + track-conflation hazard.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.886459+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_ec4bb8feca206bf2",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.886449+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.887342+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_ec4bb8feca206bf2",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_eb4221b9d34b54cd",
   "caveats": [],
   "created_at": "2026-06-10T22:35:35.856977+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_e4bebe83c4ec21eb",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:44.987488+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_03776d6cd3e0801b",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_a73023eb43fa7387",
   "caveats": [],
   "created_at": "2026-06-10T22:35:35.918103+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_edef714318aa82be",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.061177+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_41030d44f59eae22",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_e5d45a5605897295",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.839740+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_f08818383df2a902",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "BENCHMARK CLAIM (MiniF2F) — Draft-Sketch-Prove (DSP) REPORTS improved miniF2F-test pass by drafting an informal proof, sketching a formal skeleton, then closing gaps with an ATP. VERIFICATION STATE: author-reported; pipeline described; depends on the underlying ATP and the autoformalizer, both of which drift. NOT re-run here. Open obligation: reproduce with pinned ATP + LLM versions.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.839703+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_368ec6ffb5747092",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.839691+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.840379+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_368ec6ffb5747092",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "reviewer:will-blair",
    "type": "human"
   },
   "applied_event_id": "vev_b396d3a2727ae019",
   "caveats": [
    "Manual findings require evidence review before scientific use."
   ],
   "created_at": "2026-06-10T06:50:55.829250+00:00",
   "decision_reason": "Applied locally from proposal creation",
   "id": "vpr_f3a3a73919f9eb51",
   "kind": "finding.add",
   "payload": {
    "finding": {
     "assertion": {
      "direction": null,
      "entities": [],
      "relation": null,
      "text": "BENCHMARK CLAIM (MiniF2F) — DeepSeek-Prover-V1.5 REPORTS a leading miniF2F-test pass rate under a large sampling budget (RMaxTS). VERIFICATION STATE: author-reported; model weights public; eval harness in the paper; dataset version = the team's stated split. NOT independently re-run in this frontier. Open obligation: pin the split, re-run the released checkpoint, audit train/test contamination of the formal statements.",
      "type": "computational"
     },
     "conditions": {
      "age_group": null,
      "cell_type": null,
      "clinical_trial": false,
      "concentration_range": null,
      "duration": null,
      "human_data": false,
      "in_vitro": false,
      "in_vivo": false,
      "species_unverified": [],
      "species_verified": [],
      "text": "Manually added finding; requires evidence review before scientific use."
     },
     "confidence": {
      "basis": "operator-supplied frontier prior; review required",
      "extraction_confidence": 1,
      "kind": "frontier_epistemic",
      "method": "expert_judgment",
      "score": 0.3
     },
     "created": "2026-06-10T06:50:55.829210+00:00",
     "evidence": {
      "effect_size": null,
      "evidence_spans": [],
      "method": "manual state transition",
      "model_system": "",
      "p_value": null,
      "replicated": false,
      "replication_count": null,
      "sample_size": null,
      "species": null,
      "type": "computational"
     },
     "flags": {
      "contested": false,
      "declining": false,
      "gap": true,
      "gravity_well": false,
      "negative_space": false,
      "retracted": false
     },
     "id": "vf_55068262f49df0ab",
     "links": [],
     "previous_version": null,
     "provenance": {
      "authors": [
       {
        "name": "reviewer:will-blair",
        "orcid": null
       }
      ],
      "citation_count": null,
      "doi": null,
      "extraction": {
       "extracted_at": "2026-06-10T06:50:55.829198+00:00",
       "extractor_version": "vela/0.691.0",
       "method": "manual_curation",
       "model": null,
       "model_version": null
      },
      "journal": null,
      "openalex_id": null,
      "pmc": null,
      "pmid": null,
      "review": {
       "corrections": [],
       "reviewed": false,
       "reviewed_at": null,
       "reviewer": null
      },
      "source_type": "expert_assertion",
      "title": "manual finding",
      "year": null
     },
     "updated": null,
     "version": 1
    }
   },
   "reason": "Manual finding added to frontier state",
   "reviewed_at": "2026-06-10T06:50:55.829820+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_55068262f49df0ab",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_804497a5a8fbe4a0",
   "caveats": [],
   "created_at": "2026-06-10T22:35:35.937348+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_fb5a71c197133639",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.084548+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_55068262f49df0ab",
    "type": "finding"
   }
  },
  {
   "actor": {
    "id": "agent:hardening-2026-06-10",
    "type": "human"
   },
   "applied_event_id": "vev_c4da9db8be63634e",
   "caveats": [],
   "created_at": "2026-06-10T22:35:35.957332+00:00",
   "decision_reason": "Hardening: label provenance is attested (records-not-reruns); caps below verified until a deterministic rederivation exists",
   "id": "vpr_fd307d5d15c2cef7",
   "kind": "finding.note",
   "payload": {
    "text": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists."
   },
   "reason": "HARDENING (benchmark-state): label_provenance=attested (records-not-reruns; ground truth is an answer key, not a frozen-verifier rederivation), valid_as_of=2026-06-10, model_cutoff=unknown. Under the trust ladder, attested label provenance caps this record below 'verified' until a deterministic rederivation exists.",
   "reviewed_at": "2026-06-10T23:01:45.107650+00:00",
   "reviewed_by": "reviewer:will-blair",
   "schema": "vela.proposal.v0.1",
   "source_refs": [],
   "status": "applied",
   "target": {
    "id": "vf_8212daf3d7034a93",
    "type": "finding"
   }
  }
 ],
 "statement_attestations": [],
 "statement_registrations": [],
 "attempt_claims": [],
 "verifier_attachments": [],
 "transfers": [],
 "endorsements": [],
 "sources": [
  {
   "authors": [
    "reviewer:will-blair"
   ],
   "caveats": [],
   "extraction_mode": "manual_curation",
   "finding_ids": [
    "vf_03776d6cd3e0801b",
    "vf_170c9a0e01a9b1d3",
    "vf_368ec6ffb5747092",
    "vf_41030d44f59eae22",
    "vf_55068262f49df0ab",
    "vf_8212daf3d7034a93",
    "vf_9a454a597ddee070",
    "vf_cc50639072ba1867",
    "vf_cf89ac0f36e62089",
    "vf_dce7a34adf2878f2",
    "vf_ec4bb8feca206bf2",
    "vf_fec6f956d525e753"
   ],
   "id": "vs_066123dd29a9c5b4",
   "imported_at": "2026-06-10T06:50:55.899492+00:00",
   "locator": "title:manual finding",
   "source_quality": "declared",
   "source_type": "paper",
   "title": "manual finding"
  }
 ],
 "evidence_atoms": [
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_ac78fb246103bc8c"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_cf89ac0f36e62089",
   "human_verified": false,
   "id": "vea_107364fe31419d2d",
   "measurement_or_claim": "BENCHMARK META (MiniF2F). MiniF2F is ~488 olympiad/textbook formal-math problems (AMC/AIME/IMO + MATH), ported to Lean/Isabelle/HOL-Light/Metamath, split valid/test. KNOWN TRUST ISSUE: multiple incompatible versions exist (original 2021, miniF2F-v2, and the 'miniF2F Revisited' cleanup with corrected/changed statements), so pass-rates across papers are version-ambiguous unless the exact split is pinned. STATE: dataset-version hazard, not a model claim.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_a87252067ed53c46"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_170c9a0e01a9b1d3",
   "human_verified": false,
   "id": "vea_19a1954be8602f06",
   "measurement_or_claim": "BENCHMARK CLAIM (ProteinGym) — Tranception + retrieval (and TranceptEVE, combining Tranception with the EVE family model) REPORT leading zero-shot Spearman by mixing an autoregressive PLM with MSA-derived statistics. VERIFICATION STATE: author-reported; MSA-dependent, so the number moves with the alignment pipeline. NOT re-run here. Open obligation: reproduce with the stated MSAs and depth.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_1e6c4622a75133ac"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_368ec6ffb5747092",
   "human_verified": false,
   "id": "vea_1c96c2bca6cabe6d",
   "measurement_or_claim": "BENCHMARK CLAIM (MiniF2F) — Draft-Sketch-Prove (DSP) REPORTS improved miniF2F-test pass by drafting an informal proof, sketching a formal skeleton, then closing gaps with an ATP. VERIFICATION STATE: author-reported; pipeline described; depends on the underlying ATP and the autoformalizer, both of which drift. NOT re-run here. Open obligation: reproduce with pinned ATP + LLM versions.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_80603ce3c91d9931"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_55068262f49df0ab",
   "human_verified": false,
   "id": "vea_2ac0e43858a68cb9",
   "measurement_or_claim": "BENCHMARK CLAIM (MiniF2F) — DeepSeek-Prover-V1.5 REPORTS a leading miniF2F-test pass rate under a large sampling budget (RMaxTS). VERIFICATION STATE: author-reported; model weights public; eval harness in the paper; dataset version = the team's stated split. NOT independently re-run in this frontier. Open obligation: pin the split, re-run the released checkpoint, audit train/test contamination of the formal statements.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_58f0c2d7b7af876b"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_41030d44f59eae22",
   "human_verified": false,
   "id": "vea_3818a2b502e64c42",
   "measurement_or_claim": "BENCHMARK CLAIM (ProteinGym) — ProteinNPT (non-parametric transformer, supervised track) REPORTS gains by attending across labelled neighbours. VERIFICATION STATE: author-reported; SUPERVISED — not comparable to zero-shot numbers; depends on the cross-validation split. NOT re-run here. Open obligation: re-run under the official supervised CV split; never compare against zero-shot rows.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_f925e73d7e12806e"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_9a454a597ddee070",
   "human_verified": false,
   "id": "vea_4ac62ba55a4b8dbc",
   "measurement_or_claim": "BENCHMARK CLAIM (MiniF2F) — HyperTree Proof Search (HTPS, Lample et al.) REPORTS a miniF2F pass rate via learned best-first proof search. VERIFICATION STATE: author-reported; search budget and version-specific. NOT re-run here. Open obligation: re-run at the stated budget on a pinned split.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_922db3d676f1352f"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_cc50639072ba1867",
   "human_verified": false,
   "id": "vea_701b8b3ab51f97af",
   "measurement_or_claim": "BENCHMARK CLAIM (ProteinGym) — EVE (evolutionary VAE over an MSA) REPORTS strong variant-effect prediction, especially for clinical variants. VERIFICATION STATE: author-reported; fully MSA-dependent; per-protein model fitting. NOT re-run here. Open obligation: re-fit on pinned MSAs and confirm the held-out assay Spearman.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_413cc24b2718e0d3"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_fec6f956d525e753",
   "human_verified": false,
   "id": "vea_78b5ea08545ad462",
   "measurement_or_claim": "BENCHMARK CLAIM (MiniF2F) — AlphaProof (DeepMind) solved several IMO-2024 problems formally in Lean; this is sometimes conflated with a miniF2F number. VERIFICATION STATE: the IMO result is a separate, time-bounded competition claim, NOT a miniF2F-test pass rate; no public checkpoint. Open obligation: do not record an AlphaProof miniF2F figure without a cited, pinned evaluation.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_f640e1e86c8fd161"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_8212daf3d7034a93",
   "human_verified": false,
   "id": "vea_874a2c26d4e672f2",
   "measurement_or_claim": "LEAKAGE HAZARD (ProteinGym). PLMs trained on UniProt may have seen sequences related to the assay proteins; 'zero-shot' is zero-shot on the LABELS, not necessarily on the SEQUENCES. VERIFICATION STATE: training-sequence overlap with assay proteins is under-audited. Open obligation: a sequence-similarity leakage audit between each model's training set and the assay proteins before banking any SOTA claim.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_bb72edd2de97ec48"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_03776d6cd3e0801b",
   "human_verified": false,
   "id": "vea_a9c4e7b494465d60",
   "measurement_or_claim": "BENCHMARK CLAIM (ProteinGym) — ESM-1v REPORTS strong zero-shot substitution performance from masked-marginal scoring of a protein language model. VERIFICATION STATE: author-reported; weights public; depends on the scoring convention (masked-marginal vs wt-marginal) and the ProteinGym version. NOT re-run here. Open obligation: re-score the released model on the pinned v1.1 zero-shot substitution set.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_80e110739db57437"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_ec4bb8feca206bf2",
   "human_verified": false,
   "id": "vea_ad36ad1c4b0f546f",
   "measurement_or_claim": "BENCHMARK META (ProteinGym). ProteinGym benchmarks variant-effect prediction against deep mutational scanning (DMS) assays: a substitution benchmark (~217 assays) and an indel benchmark, with zero-shot and supervised tracks, scored by Spearman correlation (and AUC/MCC). KNOWN TRUST ISSUE: v1.0 vs v1.1 differ in assay set and splits; zero-shot vs supervised numbers are not comparable; MSA-dependent methods vary with the MSA pipeline. STATE: dataset-version + track-conflation hazard.",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  },
  {
   "caveats": [
    "missing evidence locator"
   ],
   "condition_refs": [
    "vcnd_a9adaea9be50fa70"
   ],
   "evidence_type": "computational",
   "extraction_method": "manual_curation",
   "finding_id": "vf_dce7a34adf2878f2",
   "human_verified": false,
   "id": "vea_c7b4329c7e40cedc",
   "measurement_or_claim": "FAITHFULNESS HAZARD (MiniF2F). A reported 'solve' is only as good as the autoformalized statement matching the intended problem; the miniF2F Revisited effort found statements that were mis-stated or trivially true. VERIFICATION STATE: faithfulness of the FORMAL statement to the INFORMAL problem is the under-checked axis. Open obligation: every banked miniF2F solve needs a statement-faithfulness attestation (vela attest --scope formalism-fidelity).",
   "source_id": "vs_066123dd29a9c5b4",
   "supports_or_challenges": "unknown"
  }
 ]
}
