<?xml version="1.0" encoding="UTF-8"?><rss xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:atom="http://www.w3.org/2005/Atom" version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:googleplay="http://www.google.com/schemas/play-podcasts/1.0"><channel><title><![CDATA[Resilient X Design]]></title><description><![CDATA[ Building robust machine learning systems. ]]></description><link>https://resilient.safeintelligence.ai</link><image><url>https://substackcdn.com/image/fetch/$s_!3ohK!,w_256,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc4c4f647-c19b-47ef-b634-48b744d904c8_1009x1009.png</url><title>Resilient X Design</title><link>https://resilient.safeintelligence.ai</link></image><generator>Substack</generator><lastBuildDate>Thu, 07 May 2026 11:32:32 GMT</lastBuildDate><atom:link href="https://resilient.safeintelligence.ai/feed" rel="self" type="application/rss+xml"/><copyright><![CDATA[Safe Intelligence]]></copyright><language><![CDATA[en]]></language><webMaster><![CDATA[safeintelai@substack.com]]></webMaster><itunes:owner><itunes:email><![CDATA[safeintelai@substack.com]]></itunes:email><itunes:name><![CDATA[Safe Intelligence]]></itunes:name></itunes:owner><itunes:author><![CDATA[Safe Intelligence]]></itunes:author><googleplay:owner><![CDATA[safeintelai@substack.com]]></googleplay:owner><googleplay:email><![CDATA[safeintelai@substack.com]]></googleplay:email><googleplay:author><![CDATA[Safe Intelligence]]></googleplay:author><itunes:block><![CDATA[Yes]]></itunes:block><item><title><![CDATA[AI Deployments and Safety in 2026]]></title><description><![CDATA[Looking forward to a great year in safe AI adoption]]></description><link>https://resilient.safeintelligence.ai/p/ai-deployments-and-safety-in-2026</link><guid 
isPermaLink="false">https://resilient.safeintelligence.ai/p/ai-deployments-and-safety-in-2026</guid><dc:creator><![CDATA[Safe Intelligence]]></dc:creator><pubDate>Fri, 06 Feb 2026 10:20:40 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!a3XS!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!a3XS!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!a3XS!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png 424w, https://substackcdn.com/image/fetch/$s_!a3XS!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png 848w, https://substackcdn.com/image/fetch/$s_!a3XS!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png 1272w, https://substackcdn.com/image/fetch/$s_!a3XS!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!a3XS!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png" width="710" height="496.4453125" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:716,&quot;width&quot;:1024,&quot;resizeWidth&quot;:710,&quot;bytes&quot;:1670713,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/186825837?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F464a82d1-89e3-49e0-a0d2-1b5a1b1c9ff0_1024x1024.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!a3XS!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png 424w, https://substackcdn.com/image/fetch/$s_!a3XS!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png 848w, https://substackcdn.com/image/fetch/$s_!a3XS!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png 1272w, https://substackcdn.com/image/fetch/$s_!a3XS!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f0e45d2-5ef5-4d9c-a771-77974592f4e4_1024x716.png 
1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Last year on these pages we covered a range of topics from ML <a href="https://resilient.safeintelligence.ai/p/ml-testing-refresher-ae9">testing basics</a> to <a href="https://resilient.safeintelligence.ai/p/formal-verification-of-ml">formal verification</a> and <a href="https://resilient.safeintelligence.ai/p/when-machine-learning-models-stop">drift</a>. This year we&#8217;ll be picking up those threads again and more. AI / ML deployment has continued apace and it is hard to keep up with new developments! 
What is clear is that the need for good validation, assurance and robustness has only grown:</p><ul><li><p>Autonomous flight plans are advancing in many parts of the world. Zipline has <a href="https://techcrunch.com/2026/01/21/zipline-charts-drone-delivery-expansion-with-600m-in-new-funding/">announced</a> autonomous drone delivery for Houston and Phoenix in the United States. Walmart and Wing <a href="https://www.wired.com/story/walmart-wing-expand-drone-delivery/">announced</a> a major expansion of drone delivery to 100 additional U.S. stores, enabled by operational authorizations from the Federal Aviation Administration. In other places like the United Arab Emirates, authorities began mapping air corridors for air taxis and cargo drones; in parallel, there are plans for public <a href="https://spectrum.ieee.org/joby-air-taxi">air-taxi operations in Dubai in 2026</a>.</p></li></ul><ul><li><p>Driverless cars are set to reach more cities with <a href="https://waymo.com/blog/2025/10/hello-london-your-waymo-ride-is-arriving">Waymo coming to London in 2026</a>.</p></li></ul><ul><li><p>Enterprise deployments of AI-based agents are on the agenda for almost every large company, many of which are designed to work fully autonomously.</p></li></ul><ul><li><p>In many cases organizations are no longer just building their AI systems and agents from scratch but using off-the-shelf frameworks and components. This adds an extra layer of challenge to the task of validation since teams often don&#8217;t have full access to the model itself.</p></li></ul><p>This year we&#8217;ll dig further into testing and validation of AI to meet these challenges. For now though, here are five of our favorite posts from last year!</p><ul><li><p>ML Testing Refresher - Aka Skyscrapers and Rocks (<a href="https://resilient.safeintelligence.ai/p/ml-testing-refresher-ae9">https://resilient.safeintelligence.ai/p/ml-testing-refresher-ae9</a>). 
This article covers why testing and validation of ML is so hard (and necessary).</p></li><li><p>The ML Benchmarking Primer (<a href="https://resilient.safeintelligence.ai/p/ml-benchmarking-primer">https://resilient.safeintelligence.ai/p/ml-benchmarking-primer</a>). Going beyond testing to benchmarking and continual performance improvements.</p></li><li><p>Formal Verification of ML (<a href="https://resilient.safeintelligence.ai/p/formal-verification-of-ml">https://resilient.safeintelligence.ai/p/formal-verification-of-ml</a>). What happens when you want to go beyond testing and deeply analyze the properties of a model.</p></li><li><p>VNN-COMP: Benchmarking the Verification of Neural Networks (<a href="https://resilient.safeintelligence.ai/p/vnn-comp-benchmarking-the-verification">https://resilient.safeintelligence.ai/p/vnn-comp-benchmarking-the-verification</a>). Covering the trailblazing competition that helps push the boundaries of what&#8217;s possible in formal verification each year.</p></li><li><p>When Machine Learning Models Stop Seeing Clearly (<a href="https://resilient.safeintelligence.ai/p/when-machine-learning-models-stop">https://resilient.safeintelligence.ai/p/when-machine-learning-models-stop</a>). A nice primer on what drift is and why it needs to be kept in check!</p></li></ul><p>We&#8217;re looking forward to a great year in safe AI adoption!</p><div class="captioned-button-wrap" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/ai-deployments-and-safety-in-2026?utm_source=substack&utm_medium=email&utm_content=share&action=share&quot;,&quot;text&quot;:&quot;Share&quot;}" data-component-name="CaptionedButtonToDOM"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! 
This post is public so feel free to share it.</p></div><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/ai-deployments-and-safety-in-2026?utm_source=substack&utm_medium=email&utm_content=share&action=share&quot;,&quot;text&quot;:&quot;Share&quot;}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/ai-deployments-and-safety-in-2026?utm_source=substack&utm_medium=email&utm_content=share&action=share"><span>Share</span></a></p></div><p></p>]]></content:encoded></item><item><title><![CDATA[When Machine Learning Models Stop Seeing Clearly]]></title><description><![CDATA[Drift Happens!]]></description><link>https://resilient.safeintelligence.ai/p/when-machine-learning-models-stop</link><guid isPermaLink="false">https://resilient.safeintelligence.ai/p/when-machine-learning-models-stop</guid><dc:creator><![CDATA[Brain Aboze]]></dc:creator><pubDate>Tue, 11 Nov 2025 09:01:53 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!ilWg!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<blockquote><p>A vision model trained on London&#8217;s summer mornings can start losing confidence by November. The city turns reflective; to the human eye, the scene is still London, but to the model, it&#8217;s an entirely new world. 
This is <strong>drift</strong>, not a server crash, not an obvious bug, but a quiet decay that begins the moment the world changes faster than the model can keep up.</p></blockquote><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!ilWg!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!ilWg!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!ilWg!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!ilWg!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!ilWg!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!ilWg!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png" width="1024" height="1024" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:1910593,&quot;alt&quot;:&quot;&quot;,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/177460992?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" title="" srcset="https://substackcdn.com/image/fetch/$s_!ilWg!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!ilWg!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!ilWg!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!ilWg!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2d7aabaf-de4c-4fe6-b6ca-099d23f90c85_1024x1024.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container 
restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Machine learning systems rarely fail overnight. Their decline is gradual, almost imperceptible. The model keeps running and predictions keep flowing, yet beneath the surface, a silent divergence grows between what the model <em>remembers</em> and what it <em>sees</em>. 
Over time, what began as a few misclassifications on a foggy morning can grow into systemic misjudgement, not because the model failed, but because <em><strong>London moved on, and the model didn&#8217;t</strong></em>.</p><h1>What exactly is Drift?</h1><p>At its core, <strong>drift is the degradation of model performance caused by a divergence between the production and training environments.</strong></p><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! Subscribe for free to receive new posts and support my work.</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><p>It can emerge from anywhere &#8212; new camera sensors, shifting seasonal light, or evolving user behaviour. 
Every model begins life as a frozen snapshot of the world; drift begins the moment the world moves on.</p><p>Formally, drift describes the divergence between the joint data distributions seen during training and those encountered in production:</p><div class="latex-rendered" data-attrs="{&quot;persistentExpression&quot;:&quot;P_{\\text{train}}(X, y) \\neq P_{\\text{production}}(X, y)&quot;,&quot;id&quot;:&quot;ERDKSXNWGH&quot;}" data-component-name="LatexBlockToDOM"></div><p>This joint distribution can be decomposed into two components:</p><div class="latex-rendered" data-attrs="{&quot;persistentExpression&quot;:&quot;P(X, y) = P(y \\mid X) \\cdot P(X)&quot;,&quot;id&quot;:&quot;EBCWFEHDGQ&quot;}" data-component-name="LatexBlockToDOM"></div><p>So drift may occur in <strong>either</strong> the input distribution <code>P(X)</code> or the conditional relationship <code>P(y&#8739;X)</code>, or both. This gives rise to the main categories of drift observed in machine learning systems.</p><h2>Types of Drift</h2><h3>Covariate Drift (Data Drift)</h3><p>The input distribution <code>P(X)</code> changes, while the relationship <code>P(y&#8739;X)</code> remains constant. Imagine a CCTV model trained on bright summer mornings in Oxford Street. By winter, the same scene has now changed. The objects haven&#8217;t changed, but the way they appear has. 
</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!9CGx!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!9CGx!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!9CGx!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!9CGx!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!9CGx!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!9CGx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png" width="1024" height="1024" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:2166094,&quot;alt&quot;:&quot;&quot;,&quot;title&quot;:&quot;&quot;,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/177460992?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" title="" srcset="https://substackcdn.com/image/fetch/$s_!9CGx!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!9CGx!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!9CGx!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!9CGx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F59659628-8345-4fdb-bd7d-9cb845191f35_1024x1024.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container 
restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><div class="pullquote"><p>That&#8217;s <strong>covariate drift</strong>, when the visuals change, but the meaning doesn&#8217;t.</p></div><p>A time-dependent version, known as <em><strong>temporal drift</strong></em>, happens when input data varies by hour, season, or event. Morning traffic in London looks nothing like midnight traffic. Unless the model encodes these rhythms, performance will fluctuate with time.</p><h3>Prior Probability Drift (Label Drift)</h3><p>The overall class or label distribution <code>P(y)</code> changes, even if the inputs look similar. In summer, London&#8217;s roads are full of cyclists<code> (y1)</code> and people<code> (y2)</code>. Your model is trained on this &#8220;normal&#8221; distribution<code> P_train(y)</code>. 
In winter, there are far fewer cyclists but a surge in delivery vans<code> (y3)</code>. The model&#8217;s idea of a &#8220;normal&#8221; scene is now wrong. It may become over-confident in predicting cyclists (its prior) and less accurate at identifying the newly common vans. This shift in balance is label drift.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!teHf!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!teHf!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!teHf!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!teHf!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!teHf!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!teHf!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png" width="1024" height="1024" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1024,&quot;width&quot;:1024,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:2055030,&quot;alt&quot;:&quot;&quot;,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/177460992?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" title="" srcset="https://substackcdn.com/image/fetch/$s_!teHf!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png 424w, https://substackcdn.com/image/fetch/$s_!teHf!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png 848w, https://substackcdn.com/image/fetch/$s_!teHf!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png 1272w, https://substackcdn.com/image/fetch/$s_!teHf!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f01af0e-96ba-47db-97d5-d0e407e38de6_1024x1024.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg 
role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>This is a critical factor for models sensitive to class imbalance.</p><div class="pullquote"><p>The frequency of <em>what</em> you are trying to find changes, even if the inputs and meanings stay the same.</p></div><h3>Concept Drift</h3><p>The mapping between inputs and outputs <code>P(y&#8739;X)</code> itself changes. The same input now implies a different meaning. A content moderation model is trained to classify <code>X</code> (a &#8220;thumbs up&#8221; gesture) as <code>y1</code> (&#8220;benign&#8221;). The company expands to new regions where this gesture is offensive. The human labelling team is now retrained to label the same <code>X</code> as <code>y2</code> (&#8220;offensive&#8221;). 
If the model isn&#8217;t retrained, it will fail, a victim of concept drift.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!wS8S!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!wS8S!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png 424w, https://substackcdn.com/image/fetch/$s_!wS8S!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png 848w, https://substackcdn.com/image/fetch/$s_!wS8S!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png 1272w, https://substackcdn.com/image/fetch/$s_!wS8S!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!wS8S!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png" width="1456" height="1082" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1082,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:215211,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/178463103?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!wS8S!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png 424w, https://substackcdn.com/image/fetch/$s_!wS8S!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png 848w, https://substackcdn.com/image/fetch/$s_!wS8S!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png 1272w, https://substackcdn.com/image/fetch/$s_!wS8S!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F08e11231-d2d2-408c-a428-74440bfc81d6_3685x2739.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" 
width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Concept drift can occur in several ways:</p><ul><li><p><strong>Sudden Drift:</strong> An abrupt change in rules or context.</p></li><li><p><strong>Gradual Drift:</strong> A slow evolution over time.</p></li><li><p><strong>Incremental Drift:</strong> A sequence of small, cumulative changes.</p></li><li><p><strong>Recurring Drift:</strong> Old patterns re-emerge cyclically.</p></li></ul><div class="pullquote"><p><strong>Concept drift</strong> happens when the meaning of the data changes, but the model keeps seeing it the old way.</p></div><h3>Model Obsolescence &amp; Internal Factors</h3><p>Not all decay comes from data. Some arise within the model itself and are operational, not statistical. This isn&#8217;t drift in the strict sense, but staleness as the model stops keeping pace with a changing world. 
The clearest form is Algorithmic Obsolescence, where architectures or representations age out. A CNN, once state-of-the-art, may falter against newer vision transformers, not because the data changed, but because the model didn&#8217;t evolve. As data evolves, weights and hyperparameters tuned for past distributions gradually reflect drifts. This form of decay highlights that a model is an artefact of its time. </p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!ibBZ!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!ibBZ!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png 424w, https://substackcdn.com/image/fetch/$s_!ibBZ!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png 848w, https://substackcdn.com/image/fetch/$s_!ibBZ!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png 1272w, https://substackcdn.com/image/fetch/$s_!ibBZ!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!ibBZ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png" width="1456" height="819" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:819,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:140796,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/178463103?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!ibBZ!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png 424w, https://substackcdn.com/image/fetch/$s_!ibBZ!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png 848w, https://substackcdn.com/image/fetch/$s_!ibBZ!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png 1272w, https://substackcdn.com/image/fetch/$s_!ibBZ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3c6136d0-30eb-462d-b9d6-568d2f64b1eb_4139x2328.png 
1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>This is why continuous retraining pipelines and benchmark refresh cycles are critical MLOps practices, as they provide a mechanism not only to update data but also to upgrade the model&#8217;s architecture and training methods periodically.</p><h1>Drift in Computer Vision Systems</h1><p>In computer vision, drift isn&#8217;t abstract; it&#8217;s visible. A model&#8217;s perception of the world depends on light, texture, and hardware, so even subtle changes can disrupt its understanding. 
<strong>Data drift</strong> occurs when the pixels themselves change: new camera sensors or image processors alter colour balance and noise; compression or resizing in production reduces image fidelity; and seasonal light alters how edges and shadows are captured. The world looks the same to us but different to the model.</p><p><strong>Concept drift</strong> is more deceptive. The pixels stay the same, but their meaning shifts. A product classifier for &#8220;shoes&#8221; may fail when confronted with new designs it was never trained on. A defect detector once accurate for &#8220;scratches&#8221; and &#8220;dents&#8221; may struggle with a new category, such as &#8220;micro-fractures.&#8221; In other cases, the ground truth itself evolves; for instance, content moderation models must constantly adapt as community standards redefine what is considered unsafe.</p><h1>The MLOps Playbook on How to Monitor Drift</h1><p>You can&#8217;t detect change without a baseline, and monitoring is not a passive activity but an active statistical process. Drift detection begins with a reference. A baseline profile captures the statistical fingerprint of your training data with histograms, quantiles, mean, and variance for numeric features; category counts for discrete ones. For vision systems, this is the most critical step. Drift cannot be tracked at the pixel level; instead, you monitor:</p><ol><li><p><strong>Proxy Metrics:</strong> Distributions of low-level features like brightness, contrast, and sharpness.</p></li><li><p><strong>Embedding Distributions:</strong> The statistical profile (e.g., mean, covariance, or a learned density model) of embeddings extracted from a frozen backbone (e.g., ResNet or ViT).</p></li></ol><p>This baseline becomes the &#8220;ground truth&#8221; against which all future data is compared. 
With a baseline established, data drift monitoring uses statistical tests to assess how new data deviates from the baseline.</p><h2>Statistical Distance Methods (for Low-Dimensional Proxies)</h2><p>For 1D proxy metrics (like brightness) or categorical features, classical statistical tests are highly effective and explainable.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!_ENO!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff2527aba-52b6-4741-8260-3455e44ae707_1239x833.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!_ENO!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff2527aba-52b6-4741-8260-3455e44ae707_1239x833.png 424w, https://substackcdn.com/image/fetch/$s_!_ENO!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff2527aba-52b6-4741-8260-3455e44ae707_1239x833.png 848w, https://substackcdn.com/image/fetch/$s_!_ENO!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff2527aba-52b6-4741-8260-3455e44ae707_1239x833.png 1272w, https://substackcdn.com/image/fetch/$s_!_ENO!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff2527aba-52b6-4741-8260-3455e44ae707_1239x833.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!_ENO!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff2527aba-52b6-4741-8260-3455e44ae707_1239x833.png" width="1239" height="833" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/f2527aba-52b6-4741-8260-3455e44ae707_1239x833.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:833,&quot;width&quot;:1239,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:170539,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/178463103?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F40b42e06-7eda-4252-997c-a096e8b3501b_1240x924.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!_ENO!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff2527aba-52b6-4741-8260-3455e44ae707_1239x833.png 424w, https://substackcdn.com/image/fetch/$s_!_ENO!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff2527aba-52b6-4741-8260-3455e44ae707_1239x833.png 848w, https://substackcdn.com/image/fetch/$s_!_ENO!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff2527aba-52b6-4741-8260-3455e44ae707_1239x833.png 1272w, https://substackcdn.com/image/fetch/$s_!_ENO!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff2527aba-52b6-4741-8260-3455e44ae707_1239x833.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" 
height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h4>High-Dimensional and Streaming Methods (for Embeddings)</h4><p>Classical tests lose power in high-dimensional embedding spaces. 
For this, more advanced methods are required.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!8MfI!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!8MfI!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png 424w, https://substackcdn.com/image/fetch/$s_!8MfI!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png 848w, https://substackcdn.com/image/fetch/$s_!8MfI!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png 1272w, https://substackcdn.com/image/fetch/$s_!8MfI!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!8MfI!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png" width="1240" height="1046" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1046,&quot;width&quot;:1240,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:228938,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/178463103?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!8MfI!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png 424w, https://substackcdn.com/image/fetch/$s_!8MfI!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png 848w, https://substackcdn.com/image/fetch/$s_!8MfI!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png 1272w, https://substackcdn.com/image/fetch/$s_!8MfI!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7b6250d1-6e4d-48cb-91a2-545d557da427_1240x1046.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" 
width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h2>Monitoring Concept Drift</h2><p>Concept drift is the hardest to monitor because it concerns <em>meaning</em>, not just data. Detecting it requires ground-truth labels, which in production are often delayed, sparse, or expensive. When labels aren&#8217;t immediately available, monitoring must rely on <strong>indirect signals</strong>.</p><h3><strong>Without Labels (Indirect Monitoring)</strong></h3><p>In the absence of new labels, the most practical approach is to observe the model&#8217;s own behaviour. Track the distribution of predictions over time: if a model that once predicted <code>[80% A, 20% B]</code> now outputs <code>[50% A, 50% B]</code>, its internal representation of the problem has shifted. 
Similarly, monitor confidence entropy; a rise in uncertainty or wider spread of confidence scores often precedes measurable accuracy decay. These indicators don&#8217;t prove drift, but they are early warnings that the relationship between inputs and outputs may have changed.</p><h3><strong>With Labels (Direct Monitoring)</strong></h3><p>When ground truth becomes available, drift detection becomes concrete. Track performance metrics over rolling windows to reveal gradual degradation. To separate real decay from random noise, apply <strong>Statistical Process Control (SPC)</strong> techniques, such as&nbsp;<em>EWMA</em>&nbsp;or&nbsp;<em>CUSUM</em>&nbsp;charts, which flag statistically significant performance drops. The most reliable systems maintain a <strong>human-in-the-loop</strong>, continuously labelling a small, rotating sample of production data (typically 1&#8211;2 %). This rolling ground truth acts as a living benchmark, enabling early detection and timely retraining.</p><div class="pullquote"><p>In practice, effective concept-drift monitoring combines indirect statistical signals for speed with direct labelled feedback for certainty, closing the MLOps feedback loop between data, model, and meaning.</p></div><h1>Key Takeaways</h1><p>Building a high-performance model is not the finish line, but a starting point. The real challenge in production machine learning is keeping models relevant and resilient to the inevitability of change. Here are the key takeaways for building a drift-robust MLOps practice:</p><ul><li><p><strong>Monitoring Is Not Optional</strong>: The world will change, and your data will change with it. Continuous monitoring isn&#8217;t an add-on but a core requirement for any deployed model. Use a defence-in-depth strategy that includes statistical tests, embedding-level methods, and performance metrics.</p></li><li><p><strong>Detection Without Response Is Just Noise</strong>: Every alert must lead to action. 
Adaptation strategies fall into two complementary paths:</p><ul><li><p><strong>Data-Centric Mitigation:</strong> Fix the data. Use data augmentation to simulate new conditions (e.g., fog, lighting, sensor noise) or active learning to re-label uncertain or drifted samples.</p></li><li><p><strong>Model-Centric Mitigation:</strong> Fix the model. Choose full retraining for maximum reliability, incremental fine-tuning for speed, or online learning for continuous, real-time adaptation.</p></li></ul></li><li><p><strong>Close the Loop: </strong>Drift is an engineering problem; monitoring pipelines must be automated to trigger alerts, and those alerts must connect directly to governance and retraining workflows.</p></li><li><p><strong>Keep Recalibrating:</strong> Treat your model like a camera that must be constantly refocused and recalibrated to keep seeing the world clearly. The goal is not a <em>perfect</em> model, but a <em>resilient</em> one.</p></li></ul><h1>Further Reading</h1><p>&#8594; <a href="https://neptune.ai/blog/how-to-monitor-your-models-in-production-guide">Neptune AI: A Comprehensive Guide on How to Monitor Your Models in Production</a></p><p>&#8594; <a href="https://dcai.csail.mit.edu/2024/data-centric-model-centric/">MIT CSAIL: Data-Centric AI vs. Model-Centric AI</a></p><p>&#8594; <a href="https://www.amazon.co.uk/Practical-MLOps-Operationalizing-Machine-Learning/dp/1098103017">Practical MLOps by Noah Gift and Alfredo Deza</a></p><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! 
Subscribe for free to receive new posts</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><div class="pullquote"><p>Stay safe &#128154;</p></div><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/when-machine-learning-models-stop/comments&quot;,&quot;text&quot;:&quot;Leave a comment&quot;,&quot;action&quot;:null,&quot;class&quot;:null}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/when-machine-learning-models-stop/comments"><span>Leave a comment</span></a></p><p></p>]]></content:encoded></item><item><title><![CDATA[Concepts of Design Assurance for Neural Networks (CoDANN)]]></title><description><![CDATA[Building Trust in AI for Flight Systems]]></description><link>https://resilient.safeintelligence.ai/p/concepts-of-design-assurance-for</link><guid isPermaLink="false">https://resilient.safeintelligence.ai/p/concepts-of-design-assurance-for</guid><dc:creator><![CDATA[Brain Aboze]]></dc:creator><pubDate>Tue, 21 Oct 2025 07:02:35 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!57dJ!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<blockquote><p>For decades, aviation software has been built on the principle of deterministic logic. Give the system input A, and it will always produce output B. This predictability is the bedrock of aviation safety certification, where every possible path can be tested, verified, and traced back to a requirement. 
Artificial intelligence breaks that contract. Modern neural networks have shown remarkable skill in vision and perception tasks relevant to flight, from obstacle detection to runway tracking, but their performance gains come at a cost: &#8220;<strong>complexity</strong>&#8221;. As these models grow deeper and more data-driven, their inner workings become harder to verify, explain, or even fully reproduce. </p></blockquote><p>This creates what regulators now call the <strong>&#8220;assurance gap.&#8221;</strong> How can agencies like the <a href="https://www.faa.gov/">Federal Aviation Administration (FAA)</a> or the <a href="https://www.easa.europa.eu/en">European Union Aviation Safety Agency (EASA)</a> certify a system whose decision-making can&#8217;t be perfectly predicted or explained?</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!57dJ!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!57dJ!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png 424w, https://substackcdn.com/image/fetch/$s_!57dJ!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png 848w, https://substackcdn.com/image/fetch/$s_!57dJ!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png 1272w, 
https://substackcdn.com/image/fetch/$s_!57dJ!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!57dJ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png" width="1456" height="993" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:993,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:6894533,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/176340670?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!57dJ!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png 424w, https://substackcdn.com/image/fetch/$s_!57dJ!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png 848w, 
https://substackcdn.com/image/fetch/$s_!57dJ!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png 1272w, https://substackcdn.com/image/fetch/$s_!57dJ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F760f8e83-2dd9-4a9e-8875-d8d84703846b_3168x2160.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>This is the exact problem that the <strong>Co</strong>ncepts of <strong>D</strong>esign <strong>A</strong>ssurance for 
<strong>N</strong>eural <strong>N</strong>etworks (CoDANN) framework, a foundational project by <a href="https://www.easa.europa.eu/en">EASA</a> and <a href="https://www.daedalean.ai/">Daedalean</a>, was designed to solve. Instead of trying to verify the final, opaque neural network, the framework&#8217;s breakthrough was to shift the focus to certifying the entire process that created it. This introduced a new methodology called Learning Assurance.</p><p>To appreciate this shift, we first need to look at the established standard for aviation software: the traditional <strong>V-shaped development cycle</strong>.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!xIyH!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!xIyH!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png 424w, https://substackcdn.com/image/fetch/$s_!xIyH!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png 848w, https://substackcdn.com/image/fetch/$s_!xIyH!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png 1272w, https://substackcdn.com/image/fetch/$s_!xIyH!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!xIyH!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png" width="1456" height="993" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:993,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:126005,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/176340670?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!xIyH!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png 424w, https://substackcdn.com/image/fetch/$s_!xIyH!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png 848w, https://substackcdn.com/image/fetch/$s_!xIyH!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png 1272w, https://substackcdn.com/image/fetch/$s_!xIyH!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F44ab7ac0-3863-4a7e-8ba6-f11c339375ff_3168x2160.png 
1456w" sizes="100vw"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>This model is the perfect picture of deterministic engineering that moves downwards through requirements, design, and implementation and then climbs back up through testing and validation. This framework gives aviation software provable traceability where every requirement is explicitly validated by a corresponding test, and the respective implementation traces back to design decisions. 
</p><p>The V-model&#8217;s logic breaks down where the &#8216;Implementation&#8217; phase is no longer a human writing explicit, verifiable rules, but a learning system (neural network) that learns implicit patterns from data, creating a system whose logic is not fixed and can evolve as the data itself shifts over time. This phase now consists of the dataset and the training algorithm, two elements that the V-model was never designed to handle. To address this, CoDANN didn&#8217;t discard the V-model but rather expanded it into a W-model.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!XdXx!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!XdXx!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png 424w, https://substackcdn.com/image/fetch/$s_!XdXx!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png 848w, https://substackcdn.com/image/fetch/$s_!XdXx!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png 1272w, https://substackcdn.com/image/fetch/$s_!XdXx!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!XdXx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png" width="1456" height="993" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:993,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:136252,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/176340670?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!XdXx!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png 424w, https://substackcdn.com/image/fetch/$s_!XdXx!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png 848w, https://substackcdn.com/image/fetch/$s_!XdXx!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png 1272w, https://substackcdn.com/image/fetch/$s_!XdXx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F63003fce-8416-4ee4-9be6-883bcc663ac8_3168x2160.png 
1456w" sizes="100vw"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Here&#8217;s what each stage means in the W-model:</p><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/1P0vg/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/fd0248c6-48f8-448b-a8b4-a33437c57432_1220x2318.png&quot;,&quot;thumbnail_url_full&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/851e8cf7-4926-44cf-99a1-155b3d10fd85_1220x2318.png&quot;,&quot;height&quot;:1149,&quot;title&quot;:&quot;| Created with 
Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/1P0vg/1/" width="730" height="1149" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><h1>Evolution of CoDANN</h1><p>The <a href="https://www.easa.europa.eu/en/document-library/general-publications/concepts-design-assurance-neural-networks-codann">CoDANN (2020)</a> and its W-model provided the foundational strategy (certify the process, not just the product). Its 2021 successor, <a href="https://www.easa.europa.eu/en/document-library/general-publications/concepts-design-assurance-neural-networks-codann-ii">CoDANN II</a>, immediately extended this framework to practical deployment, tackling the challenges of implementation and inference assurance (like quantisation, hardware mapping, and timing analysis) while also formally adding Explainability, system integration, and robust runtime (Out-of-Distribution) monitoring as core pillars of a trustworthy AI.</p><p>On Europe&#8217;s path, EASA formally adopted these concepts in its <a href="https://www.easa.europa.eu/en/document-library/general-publications/easa-artificial-intelligence-roadmap-20">AI Roadmap 2.0 (2023)</a>, making Learning Assurance a core pillar of its official strategy. The goal is pragmatic: use existing safety frameworks, but add the new ML-specific processes and evidence required to scale from simple assistance to higher autonomy. 
</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!uFz8!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!uFz8!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png 424w, https://substackcdn.com/image/fetch/$s_!uFz8!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png 848w, https://substackcdn.com/image/fetch/$s_!uFz8!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png 1272w, https://substackcdn.com/image/fetch/$s_!uFz8!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!uFz8!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png" width="1040" height="557" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/c384a799-453d-40df-a798-965799ec3787_1040x557.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:557,&quot;width&quot;:1040,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:134086,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/176340670?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!uFz8!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png 424w, https://substackcdn.com/image/fetch/$s_!uFz8!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png 848w, https://substackcdn.com/image/fetch/$s_!uFz8!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png 1272w, https://substackcdn.com/image/fetch/$s_!uFz8!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc384a799-453d-40df-a798-965799ec3787_1040x557.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" 
height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Snippet of EASA AI Roadmap 2.0, <a href="https://www.easa.europa.eu/en/document-library/general-publications/easa-artificial-intelligence-roadmap-20">EASA</a></figcaption></figure></div><p>The US Path via the <strong><a href="https://www.faa.gov/media/82891">FAA Roadmap for AI Safety Assurance (2024) </a></strong>followed suit. It sets out the principles for adopting AI within the existing certification scaffolding, emphasising incremental approvals, human-in-the-loop operation, and lifecycle assurance that covers both design-time and operation-time.</p><div class="pullquote"><p>While each regulator has its own roadmap, their destination is the same. 
Both agree that there can be no widespread adoption of AI without a new, rigorous body of scientific proof or quantified assurance evidence.</p></div><p>To bridge national frameworks, <strong><a href="https://standardsworks.sae.org/standards-committees/g-34-artificial-intelligence-aviation">SAE</a> G-34</strong> and <strong><a href="https://www.eurocae.net/">EUROCAE</a> WG-114</strong> are developing the first globally harmonised AI assurance rulebook: <strong><a href="https://na-admin.eventscloud.com/file_uploads/115fca49330a77ce92d7fe04e9874faf_Day1-Jahn-202508ED-324ARP6983presFAAAI-MLTechExchangeMeeting_8-5-25-Read-Only.pdf">ARP 6983 / ED-324</a></strong> (Development and Assurance guidelines for Aeronautical Systems and Equipment Implemented with Machine Learning). The forthcoming standard transforms the W-model into an actionable compliance checklist, specifying how to produce certifiable evidence for data management, training, model verification, configuration control, and runtime assurance.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!MdId!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!MdId!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png 424w, 
https://substackcdn.com/image/fetch/$s_!MdId!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png 848w, https://substackcdn.com/image/fetch/$s_!MdId!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png 1272w, https://substackcdn.com/image/fetch/$s_!MdId!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!MdId!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png" width="1049" height="662" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:662,&quot;width&quot;:1049,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:149486,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/176340670?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" 
srcset="https://substackcdn.com/image/fetch/$s_!MdId!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png 424w, https://substackcdn.com/image/fetch/$s_!MdId!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png 848w, https://substackcdn.com/image/fetch/$s_!MdId!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png 1272w, https://substackcdn.com/image/fetch/$s_!MdId!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F67c9c2b6-2e47-408e-803d-2fddecfd898e_1049x662.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" 
stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption"><strong>ARP 6983 / ED-324</strong> objectives example table, <a href="https://na.eventscloud.com/file_uploads/115fca49330a77ce92d7fe04e9874faf_Day1-Jahn-202508ED-324ARP6983presFAAAI-MLTechExchangeMeeting_8-5-25-Read-Only.pdf">SAE INTERNATIONAL &amp; EUROCAE</a></figcaption></figure></div><p>To illustrate what that looks like in practice, the draft ARP 6983 / ED-324 defines explicit objectives for every phase of the <em>Machine Learning Development Lifecycle (MLDL)</em>. Each assurance level has its own set of mandatory objectives and distinct assurance checkpoints.</p><h1>Beyond the Checklist</h1><p>The ARP 6983 / ED-324 standard provides the compliance checklist, but how do engineers actually prove a complex, probabilistic neural network is safe? This is achieved through a multi-layered defence:</p><ol><li><p><strong>Formal Verification</strong>: This is where the strategy shifts from exhaustive testing to mathematical proof that the model satisfies specific safety rules. It&#8217;s impossible to test for every single thing the AI might see, such as every possible weather condition, every type of runway, and every time of day. Instead of just trying to find failures, engineers now either prove their absence or find a concrete counterexample. Read more about <a href="https://resilient.safeintelligence.ai/p/formal-verification-of-ml">formal verification here &#8594;</a>.</p></li><li><p><strong>Overarching Properties (OPs)</strong>: A concept pioneered by <a href="https://shemesh.larc.nasa.gov/arg/utops.pdf">NASA</a>, OPs take a pragmatic approach. Rather than proving everything about the AI, you prove a small set of unbreakable safety boundaries. 
<em>For example, the aircraft will always maintain a safe distance from another aircraft</em>. These rules provide a rock-solid, provable safety guarantee, no matter what the complex AI model thinks it&#8217;s seeing.</p></li><li><p><strong>Runtime Assurance (RTA)</strong>: This is the system&#8217;s active safety net that operates <em>during</em> the flight like a <strong>co-pilot that never improvises. </strong>It assumes that despite the best design, the AI might still encounter a bizarre edge case. A simple, verified controller (the <em>Checker</em>) runs in parallel with the complex AI (the <em>Doer</em>). The AI can make high-level decisions and every command it sends is screened by the Checker. If a command ever looks unsafe or uncertain, say the <em>AI suggests a turn that&#8217;s too sharp or a descent that exceeds its limits</em>, the Checker instantly blocks it and replaces it with a safe, pre-approved manoeuvre. This design means that even if the AI encounters something it was never trained on, the aircraft always stays inside a mathematically defined safety envelope.</p></li></ol><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!z3QP!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!z3QP!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png 424w, https://substackcdn.com/image/fetch/$s_!z3QP!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png 848w, 
https://substackcdn.com/image/fetch/$s_!z3QP!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png 1272w, https://substackcdn.com/image/fetch/$s_!z3QP!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!z3QP!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png" width="1238" height="650" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:650,&quot;width&quot;:1238,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:94232,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/176340670?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!z3QP!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png 424w, 
https://substackcdn.com/image/fetch/$s_!z3QP!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png 848w, https://substackcdn.com/image/fetch/$s_!z3QP!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png 1272w, https://substackcdn.com/image/fetch/$s_!z3QP!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ef943dd-d814-4b3f-857a-478b92202c66_1238x650.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line 
x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Runtime Assurance architecture pattern, <a href="https://ntrs.nasa.gov/api/citations/20220015734/downloads/tm-rta-guidance.pdf">NASA</a></figcaption></figure></div><p>These proven properties and assurance methods are then all assembled into a <strong>formal safety case (</strong>a structured, auditable argument), supported by evidence, that the entire system is acceptably safe.</p><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! Subscribe for free to receive new posts</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><div class="pullquote"><p>All these initiatives are converging toward a single global vision: trust built through transparency, process, and proof. 
Certification is no longer a one-time hurdle but a continuous assurance loop, tracing safety from data to decision to deployment.</p><p>Stay safe &#128154;  and here&#8217;s to intelligence that can safely evolve in the sky.</p></div><p></p><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/concepts-of-design-assurance-for/comments&quot;,&quot;text&quot;:&quot;Leave a comment&quot;,&quot;action&quot;:null,&quot;class&quot;:null}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/concepts-of-design-assurance-for/comments"><span>Leave a comment</span></a></p><p></p>]]></content:encoded></item><item><title><![CDATA[AI Engineer Paris 2025 Recap]]></title><description><![CDATA[Through the Lens of Trustworthy AI]]></description><link>https://resilient.safeintelligence.ai/p/ai-engineer-paris-2025-recap</link><guid isPermaLink="false">https://resilient.safeintelligence.ai/p/ai-engineer-paris-2025-recap</guid><dc:creator><![CDATA[Brain Aboze]]></dc:creator><pubDate>Wed, 01 Oct 2025 12:33:09 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!oW46!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<blockquote><p>Just last month, on the 23rd-24th of September 2025, the atmosphere at the <strong><a href="https://www.ai.engineer/">AI Engineer (AIE)</a></strong>  Paris Expo was electric. This was the first edition outside the US, hosted at <strong><a href="https://stationf.co/">STATION F</a></strong> and organised with <strong><a href="https://www.koyeb.com/">Koyeb</a>.</strong> The event carried a very pragmatic energy,  with engineers focused on building and deploying the next generation of AI agents, tools, and infrastructure. 
But beyond the demos and keynotes, as I moved between the sessions I attended, a deeper personal theme emerged: <em><strong>the future isn&#8217;t just about building more intelligent AI, it&#8217;s about building more trustworthy AI.</strong></em></p></blockquote><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!oW46!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!oW46!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png 424w, https://substackcdn.com/image/fetch/$s_!oW46!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png 848w, https://substackcdn.com/image/fetch/$s_!oW46!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png 1272w, https://substackcdn.com/image/fetch/$s_!oW46!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!oW46!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png" width="1456" height="1017" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1017,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:11645209,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/174913983?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!oW46!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png 424w, https://substackcdn.com/image/fetch/$s_!oW46!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png 848w, https://substackcdn.com/image/fetch/$s_!oW46!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png 1272w, https://substackcdn.com/image/fetch/$s_!oW46!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8b2905ca-b887-4c6a-9b21-f759df475768_6221x4347.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg 
role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h1>My Key Takeaways</h1><h2>&#8594; <strong>Open Source as the Foundation of Trust</strong></h2><p><a href="https://www.linkedin.com/in/leliorenardlavaud/">L&#233;lio Renard Lavaud</a> (Mistral) reminded us that adoption at enterprise scale hinges on openness. By sidestepping vendor lock-in and surfacing transparency, open models create the reliability businesses demand. 
<a href="https://www.linkedin.com/in/sifre/">Laurent Sifre</a> (H Company) echoed this, framing open-source &#8220;bricks&#8221; as the future scaffolding for AI innovation.</p><div class="pullquote"><p>Trustworthy AI starts with open foundations, not closed walls.</p></div><h2>&#8594; <strong>Security and Safety by Design</strong></h2><p><a href="https://www.linkedin.com/in/jesus-espino/">Jes&#250;s Espino</a> (Ona) and <a href="https://www.linkedin.com/in/martinwoodward/">Martin Woodward</a> (GitHub) emphasised that trust cannot exist without security. Isolation, auditability, and reproducibility aren&#8217;t optional; they are essential, especially in regulated environments. Safe AI deployment protects both users and businesses, ensuring that agentic workflows remain reliable even under high stakes.</p><div class="pullquote"><p>Trustworthy AI requires safety and security as core principles, not afterthoughts.</p></div><h2>&#8594; <strong>Learning from Failure: Context, State, and Infrastructure</strong></h2><p><a href="https://www.linkedin.com/in/thomas-heinz-schmidt/">Thomas Schmidt</a> (Metabase) and <a href="https://www.linkedin.com/in/remilouf/">R&#233;mi Louf</a> (.txt) reminded us that agents still hallucinate metrics, misreport outputs, and occasionally break fundamental interactions. <a href="https://www.linkedin.com/in/emileifrem/">Emil Eifrem</a> (Neo4j) and talks from Spotify and Shopify highlighted the root cause: poor state and context management. Without clear context and robust infrastructure, agents misstep or stall. 
<a href="https://www.linkedin.com/in/yannleger/">Yann Leger</a> (Koyeb) emphasised that resilient, heterogeneous, agent-ready systems are essential to turn these failures into trustworthy performance.</p><div class="pullquote"><p>Trust begins by confronting failure and building the context and infrastructure that prevent it.</p></div><h2> &#8594; <strong>Evaluation and Trust in AI Behaviour</strong></h2><p>If you can&#8217;t measure it, you can&#8217;t trust it. <a href="https://www.linkedin.com/in/pierre-burgy-strapi-88671673/">Pierre Burgy</a> (Strapi), <a href="https://www.linkedin.com/in/sallyann-delucia-59a381172/">SallyAnn DeLucia</a> (Arize AI), and <a href="https://www.linkedin.com/in/srilakshmi-chavali/">Srilakshmi Chavali</a> highlighted a hard truth: static benchmarks often fail to reflect real-world reliability. Instead, emerging frameworks focus on session-level monitoring, conversations, and user &#8220;vibe&#8221; to systematically evaluate AI behaviour. Continuous feedback loops enable agents to learn and improve transparently, turning evaluation into a true measure of trustworthiness.</p><div class="pullquote"><p>Trust is earned through continuous, measurable evaluation, not just static metrics.</p></div><h2>&#8594; <strong>Efficiency, Sustainability, and Scale</strong></h2><p>From <a href="https://www.linkedin.com/in/bertrand-charpentier-76995ab6/">Bertrand Charpentier</a>&#8217;s deep dive into compression techniques to <a href="https://www.linkedin.com/in/steevemorin/">Steeve Morin</a>&#8217;s sparse attention on CPUs, engineers are proving that trustworthy AI must also be <strong>efficient AI</strong>. Sustainability isn&#8217;t a side goal. 
It&#8217;s part of reliability, cost-effectiveness, and accessibility.</p><div class="pullquote"><p><em>Trust is not just about correctness, it&#8217;s about efficiency and responsibility at scale.</em></p></div><h2>&#8594; <strong>Agents Moving from Promise to Practice</strong></h2><p>The event showed the inflexion point where agents stopped being hype and started showing their promise.</p><ul><li><p><strong>Codebase rewriting at scale</strong> (Spotify)</p></li><li><p><strong>Senior-level AI code review</strong> (Graphite)</p></li><li><p><strong>Agent swarms for refactoring</strong> (All Hands AI)</p></li><li><p><strong>Multi-agent orchestration</strong> (Docker)</p></li></ul><p>Each showed a vision where agents aren&#8217;t assistants, they&#8217;re teammates.</p><div class="pullquote"><p>Agents are no longer experiments; they&#8217;re workflows.</p></div><p>From the sessions I attended, my strongest impression is that <strong>the future of AI won&#8217;t be built on hype, but on a foundation of trust</strong>. The conference made it clear that trust is the result of a responsible engineering mindset.</p><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! 
Subscribe for free to receive new posts</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><h1>Looking Forward</h1><p>The AI Engineer community is just getting started, as the founders said, and from my own experience, you won&#8217;t want to miss what&#8217;s next. Mark your calendars and secure your tickets:</p><ul><li><p><strong>AI Engineer Code Summit</strong> | November 20&#8211;22, New York, NY</p></li><li><p><strong>AI Engineer Europe</strong> | April 7&#8211;10, London, UK</p></li><li><p><strong>AI Engineer World&#8217;s Fair 2026</strong> | June 30&#8211;July 2, San Francisco, CA</p></li></ul><p>I look forward to seeing you at the next one. <a href="https://www.ai.engineer/">Learn more</a>.</p><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/ai-engineer-paris-2025-recap/comments&quot;,&quot;text&quot;:&quot;Leave a comment&quot;,&quot;action&quot;:null,&quot;class&quot;:null}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/ai-engineer-paris-2025-recap/comments"><span>Leave a comment</span></a></p>]]></content:encoded></item><item><title><![CDATA[VNN-COMP: Benchmarking the Verification of Neural Networks]]></title><description><![CDATA[A competition that's building trust in AI]]></description><link>https://resilient.safeintelligence.ai/p/vnn-comp-benchmarking-the-verification</link><guid isPermaLink="false">https://resilient.safeintelligence.ai/p/vnn-comp-benchmarking-the-verification</guid><dc:creator><![CDATA[Brain Aboze]]></dc:creator><pubDate>Tue, 16 Sep 2025 09:15:26 GMT</pubDate><enclosure 
url="https://substackcdn.com/image/fetch/$s_!doYX!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb3d2da69-8cf3-47cb-89e4-4bd020398116_1220x5110.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<p>In my <a href="https://safeintelai.substack.com/p/ml-benchmarking-primer">Benchmarking Primer</a>, I argued that benchmarking is a method for measuring, comparing, and improving machine learning systems. In <a href="https://safeintelai.substack.com/p/formal-verification-of-ml">Formal Verification of ML</a>, I discussed how verification enables us to mathematically prove (or disprove) that a machine learning model always satisfies certain critical properties, such as robustness, safety, and fairness, for all possible valid inputs, not just the ones tested. Together, they address two key aspects of the assurance puzzle: benchmarking reveals how well a system performs, while verification confirms what it guarantees.</p><blockquote><p>But there&#8217;s still a critical question: <em><strong>who verifies the verifiers?</strong></em></p></blockquote><p>That&#8217;s where the Verification of Neural Networks Competition (<strong>VNN-COMP)</strong> comes in. Think of VNN-COMP as where benchmarking discipline meets formal verification guarantees. Each year, the competition brings together verification tools from around the world and tests them under the same conditions: identical models, identical specifications, and identical compute budgets. The result is a clear, comparative picture of how today&#8217;s verification techniques perform in practice, cutting through theory and marketing claims to see which tools actually deliver. </p><h1>A short history of VNN-COMP (2020-2025)</h1><p>From a friendly kickoff to a flagship global competition, VNN-COMP has grown into the central benchmark for neural network verification. 
The following section covers key milestones per year, and a comparison table provides a high-level overview of the competition from its inception to the present day.</p><h2>2020</h2><p>Inaugural "friendly competition" to build community. Self-reported results on non-standard hardware. Three benchmark categories were considered across the ACAS Xu, MNIST, and CIFAR-10 datasets</p><div><hr></div><h2>2021 </h2><p>Rules were formalised, official rankings were introduced, and ONNX, along with VNN-LIB, became the standard model and specification formats. Scoring was strict but straightforward: +10 points for each correctly verified SAT/UNSAT instance, and a &#8211;100 penalty for any incorrect result. To ensure fairness, every tool was run on equal-cost AWS hardware, with teams choosing between a CPU-optimised or a GPU-optimised instance. Six benchmark categories were considered across ACAS Xu, MNIST, CIFAR-10 datasets and one benchmark on database indexing.</p><div><hr></div><h2>2022 </h2><p>Fully automated evaluation pipeline on AWS. Each benchmark had a total timeout of between three and six hours, with randomisation of instances being mandatory this year, with tool tuning per benchmark level. The benchmark cut across six categories. Similar to the previous year, teams could choose from a range of AWS instance types, providing a focus on CPU, GPU, or a mixed combination.</p><div><hr></div><h2>2023</h2><p>Continued standardised ONNX/VNN-LIB and equal-cost AWS evaluation, while keeping automated pipeline &amp; standardised counter-example format. Required CE &#8220;witnesses&#8221; (input examples that verify a reported counter-example is real). Broader AWS instance choices, no time bonus and &#8211;150 penalty for incorrect result.</p><h2>2024</h2><p>Regular and extended tracks and continued ONNX/VNN-LIB format and equal-cost AWS. 
Fully automated installation and evaluation pipeline and required CE &#8220;witnesses&#8221; plus no time bonus and &#8211;150 penalty for incorrect results.</p><div><hr></div><h2>2025</h2><p>The official VNN-COMP 2025 report isn&#8217;t out yet, but Safe Intelligence was there. Read our takeaways &#128071;&#127996;</p><div class="captioned-image-container"><figure><a class="image-link image2" target="_blank" href="https://safeintelligence.ai/safe-intelligence-in-zagreb-our-highlights-from-the-2025-symposium-on-ai-verification-saiv/" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!DELN!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png 424w, https://substackcdn.com/image/fetch/$s_!DELN!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png 848w, https://substackcdn.com/image/fetch/$s_!DELN!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png 1272w, https://substackcdn.com/image/fetch/$s_!DELN!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!DELN!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png" width="1257" height="288" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:288,&quot;width&quot;:1257,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:758141,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:&quot;https://safeintelligence.ai/safe-intelligence-in-zagreb-our-highlights-from-the-2025-symposium-on-ai-verification-saiv/&quot;,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/172097829?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!DELN!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png 424w, https://substackcdn.com/image/fetch/$s_!DELN!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png 848w, https://substackcdn.com/image/fetch/$s_!DELN!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png 1272w, https://substackcdn.com/image/fetch/$s_!DELN!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4890a59c-85cf-4b01-a057-41688e7f5637_1257x288.png 1456w" sizes="100vw" loading="lazy"></picture><div></div></div></a></figure></div><div class="pullquote"><p>Let's have a quick 
overview of the history.</p></div><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/JocVa/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/b3d2da69-8cf3-47cb-89e4-4bd020398116_1220x5110.png&quot;,&quot;thumbnail_url_full&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/2c6dcd45-0e88-43a6-8590-7d224de7df71_1220x5110.png&quot;,&quot;height&quot;:2617,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/JocVa/1/" width="730" height="2617" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><p> From the friendly kickoff in 2020 to today's flagship status, VNN-COMP has become the rigorous and automated proving ground for neural network verification itself. It&#8217;s where theory meets practice and where trust in AI has to prove itself, because benchmarking alone isn&#8217;t enough, and verification without verification is a paradox. </p><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! 
Subscribe for free to receive new posts.</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><p></p><h1>Further Reading</h1><ul><li><p><a href="https://arxiv.org/abs/2301.05815">First Three Years of the International Verification of Neural Networks Competition (VNN-COMP)</a></p></li><li><p><a href="https://arxiv.org/abs/2312.16760">The Fourth International Verification of Neural Networks Competition (VNN-COMP 2023): Summary and Results</a></p></li><li><p><a href="https://arxiv.org/abs/2412.19985">The Fifth International Verification of Neural Networks Competition (VNN-COMP 2024): Summary and Results</a></p></li><li><p><a href="https://sites.google.com/view/vnn2025">About VNN 2025: 6th International Verification of Neural Networks Competition (VNN-COMP'25)</a></p></li></ul>]]></content:encoded></item><item><title><![CDATA[Deploying Machine Learning Models]]></title><description><![CDATA[From Validation to Production]]></description><link>https://resilient.safeintelligence.ai/p/deploying-machine-learning-models</link><guid isPermaLink="false">https://resilient.safeintelligence.ai/p/deploying-machine-learning-models</guid><dc:creator><![CDATA[Brain Aboze]]></dc:creator><pubDate>Tue, 12 Aug 2025 07:00:56 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!-it_!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" 
href="https://substackcdn.com/image/fetch/$s_!-it_!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!-it_!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png 424w, https://substackcdn.com/image/fetch/$s_!-it_!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png 848w, https://substackcdn.com/image/fetch/$s_!-it_!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png 1272w, https://substackcdn.com/image/fetch/$s_!-it_!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!-it_!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png" width="728" height="548.3636363636364" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:false,&quot;imageSize&quot;:&quot;normal&quot;,&quot;height&quot;:928,&quot;width&quot;:1232,&quot;resizeWidth&quot;:728,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:&quot;center&quot;,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!-it_!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png 424w, https://substackcdn.com/image/fetch/$s_!-it_!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png 848w, https://substackcdn.com/image/fetch/$s_!-it_!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png 1272w, https://substackcdn.com/image/fetch/$s_!-it_!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F93ca9525-b763-4d82-a7cd-567ee559e562_1232x928.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h1><strong>TL;DR</strong></h1><p><em>In our last article, we moved beyond conventional testing and introduced formal verification, a deeper form of validation. Now that your model has cleared every validation gate, the next challenge is to deliver it. This article covers choosing the right inference architectures and walks through progressive rollout strategies that aim to minimise risk in high-stakes industries. Lastly, it distils the industry consensus on scaling, which involves standardising, containerising, and orchestrating to handle operational complexities at scale.</em></p><h1><strong>Understanding your deployment needs</strong></h1><p>Before shipping a model to production, step back and map production requirements to the right serving pattern. Four levers drive this decision:</p><ol><li><p>Latency &amp; Throughput: How fast must each prediction return and at what volume? 
These directly dictate the speed and capacity demands on your system.</p></li></ol><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!QDHr!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!QDHr!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png 424w, https://substackcdn.com/image/fetch/$s_!QDHr!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png 848w, https://substackcdn.com/image/fetch/$s_!QDHr!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png 1272w, https://substackcdn.com/image/fetch/$s_!QDHr!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!QDHr!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png" width="1456" height="566" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:566,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!QDHr!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png 424w, https://substackcdn.com/image/fetch/$s_!QDHr!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png 848w, https://substackcdn.com/image/fetch/$s_!QDHr!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png 1272w, https://substackcdn.com/image/fetch/$s_!QDHr!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2260eaf5-cd19-41bb-828e-198df09d18aa_1553x604.png 1456w" sizes="100vw"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 
7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><ol start="2"><li><p>Data dynamics: When does input data arrive, and how fresh must predictions be?</p></li><li><p>Resource constraints &amp; scalability: Available computing power (CPU/GPU, memory) and the infrastructure's ability to scale up or down with varying loads.</p></li><li><p>Cost &amp; Interaction Model: Who/what consumes the predictions, and how price-sensitive is the workload?</p></li></ol><h1><strong>Core ML Serving Architectures</strong></h1><p>With the key criteria in mind, ML models are generally deployed in three main architecture patterns:</p><ol><li><p>Online Real-Time Inference (Synchronous)</p></li><li><p>Asynchronous Inference (Near Real-Time)</p></li><li><p>Offline Batch Prediction (Batch Transform)</p></li></ol><p>Each of these has its strengths and ideal use cases. Sometimes different terminology is used (for example, &#8220;batch prediction&#8221; vs. &#8220;offline batch transform&#8221; or grouping the first two as both &#8220;online&#8221; methods, synchronous vs. asynchronous), but the concepts remain consistent. 
Let&#8217;s explore each architecture.</p><h2><strong>Online Real-Time Inference (Synchronous)</strong></h2><p>This architecture provides immediate prediction responses with minimal latency. Applications that prioritise speed or interactive responses, like fraud detection at transaction time<strong> </strong>or cockpit alert systems, find this architecture ideal. Here, the model is served via an API that the users call <strong>synchronously</strong> (the client sends a request/input and waits immediately for the prediction result). When a request arrives, such as a JSON payload of features, the service loads the model, performs inference, and immediately returns the result. Everything happens within the context of that request, usually in a matter of milliseconds or seconds, as illustrated below:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!x3o7!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!x3o7!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!x3o7!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!x3o7!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png 1272w, 
https://substackcdn.com/image/fetch/$s_!x3o7!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!x3o7!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png" width="1456" height="1093" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/b0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1093,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!x3o7!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!x3o7!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!x3o7!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png 1272w, 
https://substackcdn.com/image/fetch/$s_!x3o7!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb0e54c27-4fc4-4643-a445-f0e877dab616_1575x1182.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>The system is typically optimised to minimise overhead in response. That might involve keeping models loaded in memory, using fast data stores for features, and even making models lighter or distilling them to run faster. The price of that speed is the cost of scaling: keeping latency low under load typically requires more instances or more powerful hardware. 
Additionally, there is a limit to the throughput achievable with a purely synchronous design; if the request volume exceeds what your servers can handle at the same time, either latency will increase or some requests may fail.</p><h2><strong>Asynchronous Inference (Near Real-Time)</strong></h2><p>This architecture addresses the bottleneck of synchronous inference by introducing a message queue between your client and the ML model. The requests are buffered (to absorb bursts), and then stateless workers pull jobs, run inference, and write results to a result store. The client is either alerted or polls a results endpoint to fetch the prediction. Because requests are buffered, workers can scale horizontally or even down to zero when idle. This architecture is illustrated below:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!eULt!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!eULt!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!eULt!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!eULt!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png 1272w, 
https://substackcdn.com/image/fetch/$s_!eULt!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!eULt!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png" width="1456" height="1093" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/a8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1093,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!eULt!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!eULt!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!eULt!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png 1272w, 
https://substackcdn.com/image/fetch/$s_!eULt!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa8a21aa1-62b7-48fe-900a-4b8d4483ae6e_1575x1182.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>If your latency target is seconds, not sub-100ms, async serving smooths out traffic spikes and converts them into steady, predictable costs while preserving reliability and throughput. 
Lastly, this architecture provides the avenue to insert preprocessing, enrichment, or post-processing stages directly in the queue flow.</p><h2><strong>Offline Batch Prediction (Batch Transform)</strong></h2><p>Offline batch prediction refers to running the model against a dataset slice or entire warehouse on a schedule or trigger and writing the outputs to storage, a data warehouse, or a feature store for downstream use. These batch jobs often run on big data processing frameworks or distributed systems (like Apache Spark, Hadoop, or distributed SQL engines) to churn through huge volumes of input.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!8DOF!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!8DOF!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!8DOF!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!8DOF!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png 1272w, https://substackcdn.com/image/fetch/$s_!8DOF!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!8DOF!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png" width="1456" height="1093" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1093,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!8DOF!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!8DOF!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!8DOF!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png 1272w, https://substackcdn.com/image/fetch/$s_!8DOF!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6501df81-0352-41a7-a66f-ca834d9e0b85_1575x1182.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset 
pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>By exploiting spot instances and off-peak capacity, you slash costs, and because compute is ephemeral, you never pay for idle GPUs. The catch? Results arrive in minutes or hours, so this is great for back-office analytics but ill-suited to user-facing paths.</p><h2><strong>Which architecture fits your workload?</strong></h2><p>Choosing the right inference architecture isn't always straightforward. 
Use this Q&amp;A-style checklist to guide your decision-making clearly and effectively:</p><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/IjrFd/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/37dcf61a-81f6-42d4-9607-9862704204ab_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:567,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/IjrFd/1/" width="730" height="567" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><h1><strong>Rollout Strategies: Safely Deploying ML Models in Production</strong></h1><p>In high-stakes domains like finance and aviation, a &#8220;big-bang&#8221; model swap (where the old system is shut down and instantly replaced by the new model) is a gamble no one can afford. Modern teams now favour progressive delivery, where new models are rolled out to a small slice of traffic or a safe environment first, validating performance in the wild, then scaling up only when metrics look solid. Let's explore progressive rollout strategies that minimise risk, provide real-world validation, and ensure stability.</p><h2><strong>Champion/Challenger</strong></h2><p>In this strategy, you deploy the new model ("challenger") in parallel to the existing production model ("champion"). 
Both receive the same inputs (mirrored traffic), but only the champion&#8217;s predictions directly impact users or operations. The challenger operates in the background, and its outputs are evaluated separately to determine whether they outperform the existing model and to support other experimental analyses. This is also known as shadow deployment, as the challenger models run in shadow mode (without affecting real outcomes).</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!mqaV!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!mqaV!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png 424w, https://substackcdn.com/image/fetch/$s_!mqaV!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png 848w, https://substackcdn.com/image/fetch/$s_!mqaV!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png 1272w, https://substackcdn.com/image/fetch/$s_!mqaV!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!mqaV!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png" width="1456" height="603" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/a5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:603,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!mqaV!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png 424w, https://substackcdn.com/image/fetch/$s_!mqaV!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png 848w, https://substackcdn.com/image/fetch/$s_!mqaV!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png 1272w, https://substackcdn.com/image/fetch/$s_!mqaV!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa5791466-4ef2-4365-89ed-e766ef30c7e7_1546x640.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft 
icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h3><strong>Why choose Champion/Challenger?</strong></h3><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/idHhq/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/5fbc52ed-d156-477b-8a4c-5d179b822b41_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:219,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/idHhq/1/" width="730" height="219" frameborder="0" 
scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/idHhq/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/bd3b18a3-451d-49a1-bff0-97164efbe4a3_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:219,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/idHhq/1/" width="730" height="219" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><p>This approach is excellent when regulations require proof of model efficacy or when the cost of a bad decision is extremely high. With proper performance and governance analysis over time, decisions on when a challenger is ready to take over as the new champion can be made.</p><h4><strong>Flask-like pseudocode</strong></h4><pre><code>@app.route("/predict")
def predict():
    features = request.get_json()['features']
    
    # Champion makes the actual prediction
    champion_prediction = model_champion.predict(features)
    
    # Challenger prediction is computed but not returned to users
    challenger_prediction = model_challenger.predict(features)
    
    # Log challenger results separately for later comparison
    log_to_evaluation_db(features, champion_prediction, challenger_prediction)
    
    return champion_prediction</code></pre><h2><strong>Canary deployment</strong></h2><p>This deployment strategy incrementally deploys the new model to a small fraction of live traffic. The fraction is then gradually increased as confidence builds, while performance and stability are continuously monitored.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!5mFx!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!5mFx!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!5mFx!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!5mFx!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png 1272w, https://substackcdn.com/image/fetch/$s_!5mFx!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!5mFx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png" width="1456" height="1093" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/c75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1093,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!5mFx!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!5mFx!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!5mFx!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png 1272w, https://substackcdn.com/image/fetch/$s_!5mFx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc75ca3bd-6f97-44c1-b4b3-849e567cca74_1575x1182.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h3><strong>Why choose canary deployment?</strong></h3><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/csZ5H/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/f0476986-fbf5-449b-bb21-7bd31bc7b27c_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:203,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/csZ5H/1/" width="730" height="203" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a 
in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><p>Canary deployments are widely used when you want a cautious, data-driven rollout using real traffic. For example, companies like Tesla and Waymo use such rollouts, as they might enable a new vision model for a small set of cars, then expand if it performs well. The canary approach strikes a balance between innovation and caution, swiftly retracting any underperforming model before it causes widespread harm.</p><h4><strong>Flask-like pseudocode</strong></h4><pre><code>CANARY_PERCENTAGE = 0.05  # Start with 5%

@app.route("/predict")
def predict():
    features = request.get_json()['features']
    
    # Route a small fraction of users randomly to the canary (new) model
    if random.random() &lt; CANARY_PERCENTAGE:
        prediction = model_new.predict(features)
        model_version = "canary"
    else:
        prediction = model_current.predict(features)
        model_version = "current"
    
    log_prediction(features, prediction, model_version)
    
    return prediction</code></pre><h2><strong>A/B Testing</strong></h2><p>In A/B testing, you split live traffic between your existing model (Model A) and a new model (Model B), assigning each to handle a different subset of users or instances. Both models are statistically compared on key performance indicators (KPIs) to determine which model truly performs better. A/B testing starts with the null hypothesis that both models perform identically. Send live traffic to each, track a key metric, and calculate a p-value for the chance that random noise explains the observed gap. If the p-value is below your threshold (e.g., 0.05), the lift is statistically significant, so safely increase Model B&#8217;s traffic or promote it to production. Otherwise, keep Model A (or keep testing).</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Am5q!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Am5q!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png 424w, https://substackcdn.com/image/fetch/$s_!Am5q!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png 848w, https://substackcdn.com/image/fetch/$s_!Am5q!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png 1272w, 
https://substackcdn.com/image/fetch/$s_!Am5q!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!Am5q!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png" width="1456" height="687" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:687,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Am5q!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png 424w, https://substackcdn.com/image/fetch/$s_!Am5q!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png 848w, https://substackcdn.com/image/fetch/$s_!Am5q!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png 1272w, 
https://substackcdn.com/image/fetch/$s_!Am5q!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F88794692-1d40-463f-bae5-9480d27cf33c_1525x720.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h3><strong>Why choose A/B testing?</strong></h3><div id="datawrapper-iframe" class="datawrapper-wrap outer" 
data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/4TSwQ/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/553684ad-216e-4b42-a85f-4f180fbb86ac_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:290,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/4TSwQ/1/" width="730" height="290" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><p>It is primarily utilised in situations where it is ethically and operationally possible to treat different users differently for a brief period. For example, a bank might route a small percentage of transactions through a new fraud model to see if it catches more fraud or flags fewer false positives than the old model.</p><h4><strong>Flask-like pseudocode</strong></h4><pre><code>TREATMENT_GROUP = set(load_user_ids_for_treatment())  # e.g., 10% of users

@app.route("/predict")
def predict():
    data = request.get_json()
    features = data['features']
    user_id = data['user_id']

    # Determine if user is in treatment (B) or control (A) group
    if user_id in TREATMENT_GROUP:
        prediction = model_B.predict(features)
        variant = "B"
    else:
        prediction = model_A.predict(features)
        variant = "A"


    # Log results for offline statistical analysis
    log_ab_test_results(user_id, features, prediction, variant)
    
    return prediction</code></pre><h1><strong>Scaling Your Deployment: Serving Hundreds or Thousands of ML Models</strong></h1><p>Once a single ML model delivers measurable business or safety improvements, rapid scaling becomes inevitable. Soon, you're managing not just a handful but hundreds or thousands of models, each bringing unique operational headaches, such as management complexity, resource contention, and cost efficiency. Leading enterprises, such as Netflix, have addressed these challenges using a proven framework: <strong>standardise, containerise, and orchestrate</strong>. Explore deeper insights:</p><ul><li><p><a href="https://netflixtechblog.com/supporting-diverse-ml-systems-at-netflix-2d2e6b6d205d">Netflix Techblog: Supporting Diverse ML Systems at Netflix</a></p></li><li><p><a href="https://www.youtube.com/watch?v=5WPrZgUu8u0">DataDog: Scaling ML Serving to Thousands of Models</a></p></li><li><p><a href="https://aws.amazon.com/blogs/machine-learning/best-practices-for-serving-hundreds-to-thousands-of-models/">Industry Best Practices: Serving Hundreds to Thousands of ML Models</a></p></li><li><p><a href="https://towardsdatascience.com/serve-hundreds-to-thousands-of-ml-models-architectures-from-industry-bf3d9474d427/">TDS: Serve hundreds to thousands of ML models &#8211; architectures from industry</a></p></li></ul><h1><strong>Actionable Takeaways</strong></h1><ol><li><p>Pin down your performance requirements, resources, cost and scalability constraints, data dynamics and interactions early; these shape your deployment choices and downstream decisions.</p></li><li><p>Map your use case to the right pattern: real-time for instant results, async for flexible workloads, or batch for large-scale efficiency.</p></li><li><p><strong>Rollout models progressively</strong></p></li></ol><ul><li><p><strong>Champion/Challenger</strong>: Safely validate a new model alongside the existing one without disruption.</p></li><li><p><strong>Canary Deployments</strong>: 
Incrementally introduce new models, catching issues early and safely.</p></li><li><p><strong>A/B Testing</strong>: Statistically validate model improvements on live traffic before full-scale deployment.</p></li></ul><ol start="4"><li><p>Standardise your configurations, containerise your models, and orchestrate your ML deployments. It&#8217;ll simplify complexity, reduce operational headaches, and make scaling frictionless.</p></li></ol><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! Subscribe for free to receive new posts</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><p></p><div><hr></div><h2>Table of Contents</h2><ul><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/tldr">TL;DR</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/understanding-your-deployment-needs">Understanding your deployment needs</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/core-ml-serving-architectures">Core ML Serving Architectures</a></p><ul><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/online-real-time-inference-synchronous">Online Real-Time Inference (Synchronous)</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/asynchronous-inference-near-real-time">Asynchronous Inference (Near Real-Time)</a></p></li><li><p><a 
href="https://resilient.safeintelligence.ai/i/170338897/offline-batch-prediction-batch-transform">Offline Batch Prediction (Batch Transform)</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/which-architecture-fits-your-workload">Which architecture fits your workload?</a></p></li></ul></li><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/rollout-strategies-safely-deploying-ml-models-in-production">Rollout Strategies: Safely Deploying ML Models in Production</a></p><ul><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/championchallenger">Champion/Challenger</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/canary-deployment">Canary deployment</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/ab-testing">A/B Testing</a></p></li></ul></li><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/scaling-your-deployment-serving-hundreds-or-thousands-of-ml-models">Scaling Your Deployment: Serving Hundreds or Thousands of ML Models</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/170338897/actionable-takeaways">Actionable Takeaways</a></p></li></ul><div><hr></div><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/deploying-machine-learning-models/comments&quot;,&quot;text&quot;:&quot;Leave a comment&quot;,&quot;action&quot;:null,&quot;class&quot;:null}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/deploying-machine-learning-models/comments"><span>Leave a comment</span></a></p><p></p>]]></content:encoded></item><item><title><![CDATA[Formal Verification Of ML]]></title><description><![CDATA[Towards Provable Guarantees]]></description><link>https://resilient.safeintelligence.ai/p/formal-verification-of-ml</link><guid isPermaLink="false">https://resilient.safeintelligence.ai/p/formal-verification-of-ml</guid><dc:creator><![CDATA[Brain 
Aboze]]></dc:creator><pubDate>Fri, 01 Aug 2025 09:05:44 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!cNkl!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<h1><strong>Table of Contents</strong></h1><ul><li><p><a href="https://resilient.safeintelligence.ai/i/169439154/tldr">TL;DR</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169439154/what-do-we-want-to-verify">What do we want to verify?</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169439154/verification-techniques">Verification Techniques</a></p><ul><li><p><a href="https://resilient.safeintelligence.ai/i/169439154/constraint-based-verification">Constraint-Based Verification</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169439154/abstraction-based-verification">Abstraction-Based Verification</a></p></li></ul></li><li><p><a href="https://resilient.safeintelligence.ai/i/169439154/comparison-of-verification-techniques">Comparison of Verification Techniques</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169439154/the-elephant-in-the-room-does-it-scale">The Elephant in the Room: Does it Scale?</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169439154/putting-formal-verification-to-work">Putting Formal Verification to Work</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169439154/further-reading">Further Reading</a></p></li></ul><h1><strong>TL;DR</strong></h1><blockquote><p><em>In the previous articles, we explored identifying the best-suited model for your use case through testing and benchmarking. But ask yourself: does testing and benchmarking give you the confidence to deploy these models in critical scenarios where a mistake is consequential? 
Do those 10,000 test cases provide you with everything you need? Perhaps you&#8217;re considering the scope and coverage of the testing and benchmarking or the inherent uncertainty that these models will face in their environment. This is precisely where formal verification moves beyond testing. It allows for a deeper analysis of the model itself, systematically searching for the fragilities that lead to failure. By generating concrete counterexamples, it proves where the model breaks, giving you the power to build a truly robust system before it's ever deployed.</em></p></blockquote><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!cNkl!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!cNkl!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png 424w, https://substackcdn.com/image/fetch/$s_!cNkl!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png 848w, https://substackcdn.com/image/fetch/$s_!cNkl!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png 1272w, https://substackcdn.com/image/fetch/$s_!cNkl!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!cNkl!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png" width="1456" height="816" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:816,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!cNkl!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png 424w, https://substackcdn.com/image/fetch/$s_!cNkl!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png 848w, https://substackcdn.com/image/fetch/$s_!cNkl!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png 1272w, https://substackcdn.com/image/fetch/$s_!cNkl!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F950aeb14-9978-4e3d-b01f-5acc89515504_1456x816.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft 
icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Formal verification is not a new concept; it has long been integral to ensuring reliability in hardware and, to a lesser extent, software. 
According to Wikipedia, <em>&#8220;formal verification is the act of <strong>proving</strong> or <strong>disproving</strong> the <strong>correctness</strong> of a <strong>system</strong> with respect to a certain <strong>formal specification</strong> or property.&#8221; </em>Immediately, four essential components emerge clearly from this definition<em>:</em></p><ol><li><p><strong>System under verification</strong>: The specific hardware or software whose behaviour needs to be rigorously checked.</p></li><li><p><strong>Formal specification</strong>: Precisely defined requirements or properties the system must satisfy, expressed mathematically rather than through informal descriptions.</p></li><li><p><strong>Correctness criterion</strong>: Conditions that confirm the system meets the specification or identify cases where it fails.</p></li><li><p><strong>Formal methods</strong>: Mathematical techniques and proof engines employed to rigorously prove or disprove whether the system satisfies the given specification.</p></li></ol><p>With these four components in mind, think of formal verification as moving beyond conventional testing by employing rigorous mathematical (<strong>formal</strong>) methods to provide stronger guarantees, which is particularly crucial when even small mistakes can lead to catastrophic outcomes, as in use cases such as autonomous driving, medical diagnostics, security surveillance, malware detection, or high-stakes finance.</p><p>Formal verification has a long history in hardware dating back to 1984 with tools like <a href="https://en.wikipedia.org/wiki/Verilog">Verilog</a> and now <a href="https://en.wikipedia.org/wiki/SystemVerilog">SystemVerilog</a>, both standardised by the IEEE and widely adopted for specifying, designing and verifying hardware. 
Software adoption lagged, largely due to <a href="https://pzuliani.github.io/papers/LASER2011-Model-Checking.pdf">state&#8209;space explosion</a> (as the number of state variables in the system increases, the size of the system state space grows exponentially). When dealing with software in aircraft or nuclear reactors, it's not enough to merely test a handful of scenarios. You need more guarantees that the software behaves correctly under every conceivable situation. Formal verification isn't merely theoretical but can have a practical, real-world impact. For example, it was successfully employed in the design of <a href="https://en.wikipedia.org/wiki/Paris_M%C3%A9tro_Line_14#:~:text=Some%20features%20of%20Line%2014's,%2DMethod%2C%20a%20formal%20method.">Paris&#8217;s autonomous M&#233;tro Line 14</a>.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!4SFU!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!4SFU!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg 424w, https://substackcdn.com/image/fetch/$s_!4SFU!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg 848w, https://substackcdn.com/image/fetch/$s_!4SFU!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg 1272w, 
https://substackcdn.com/image/fetch/$s_!4SFU!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!4SFU!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg" width="1456" height="970" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:970,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!4SFU!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg 424w, https://substackcdn.com/image/fetch/$s_!4SFU!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg 848w, https://substackcdn.com/image/fetch/$s_!4SFU!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg 1272w, 
https://substackcdn.com/image/fetch/$s_!4SFU!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F8064376d-34c1-44b1-ab08-2079f4c0f6f7_1600x1066.jpeg 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Paris Metro Station, Source:<a href="https://www.pexels.com/photo/paris-metro-station-with-passengers-and-train-29731398/"> Art&#363;ras Kokorevas</a></figcaption></figure></div><p>But how does this relate specifically to AI systems? 
The well-known AI researcher Andrej Karpathy highlights software evolution from Software 1.0 (manually written software) to Software 2.0 (software learning from data, ML models) and, more recently, Software 3.0 (AI-driven development).</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!fIsU!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!fIsU!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png 424w, https://substackcdn.com/image/fetch/$s_!fIsU!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png 848w, https://substackcdn.com/image/fetch/$s_!fIsU!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png 1272w, https://substackcdn.com/image/fetch/$s_!fIsU!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!fIsU!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png" width="1456" height="713" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:713,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!fIsU!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png 424w, https://substackcdn.com/image/fetch/$s_!fIsU!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png 848w, https://substackcdn.com/image/fetch/$s_!fIsU!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png 1272w, https://substackcdn.com/image/fetch/$s_!fIsU!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3b363fe8-ab7a-4581-a222-0f4add13b217_1600x784.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Source: Y Combinator, <a href="https://www.youtube.com/watch?v=LCEmiRjPEtQ">Andrej Karpathy: Software Is Changing (Again)</a></figcaption></figure></div><p>ML models, the core of Software 2.0, are now central to critical decision-making. However, their reliability is a key concern; while powerful, they are often fragile, failing in response to subtle data shifts that humans would not notice. This brittleness has serious consequences. For instance, minor data variations could lead to flawed loan approvals, while small visual distortions could cause an autonomous system to catastrophically misclassify its environment. Similarly, a critical customer service message could be misinterpreted by an NLP model, leading to significant service failures.</p><h1><strong>What do we want to verify?</strong></h1><p>These vulnerabilities (adversarial or otherwise) underscore the critical point that performance metrics and benchmarks alone aren't sufficient. 
We must rigorously verify specific model behaviours to eliminate these fragilities and blind spots. When failures can cost lives, livelihoods, or vast sums of money, &#8220;<em>correctness</em>&#8221; must be stated in precise, verifiable terms. The properties most often formalised in such critical systems include, but are not limited to, the following:</p><ul><li><p><strong>Robustness</strong>: Does the model maintain stable outputs despite minor perturbations in inputs?</p></li><li><p><strong>Safety</strong>: Can we confidently say the model avoids catastrophic decisions under all possible conditions?</p></li><li><p><strong>Fairness</strong>: Could subtle biases or irrelevant data variations lead to unfair outcomes?</p></li><li><p><strong>Consistency</strong>: Does the model uphold fundamental domain rules, so that its outputs never violate basic logical or physical constraints across all valid inputs?</p></li></ul><p>These are the types of behaviours formal verification methods are designed to assess, moving beyond intuition, testing and benchmarking. 
A quick overview of the verification process is illustrated below:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!GUnm!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!GUnm!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png 424w, https://substackcdn.com/image/fetch/$s_!GUnm!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png 848w, https://substackcdn.com/image/fetch/$s_!GUnm!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png 1272w, https://substackcdn.com/image/fetch/$s_!GUnm!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!GUnm!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png" width="1456" height="728" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:728,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!GUnm!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png 424w, https://substackcdn.com/image/fetch/$s_!GUnm!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png 848w, https://substackcdn.com/image/fetch/$s_!GUnm!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png 1272w, https://substackcdn.com/image/fetch/$s_!GUnm!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4a1a3fd8-a008-4b06-855e-13badf8a5549_1575x788.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Formal verification of ML</figcaption></figure></div><h1><strong>Verification Techniques</strong></h1><p>We&#8217;ve identified the critical properties, such as robustness, safety, fairness, and consistency, that we want our ML models to exhibit. Now, the next question is, <em><strong>how do we formulate these properties into a &#8216;verification problem&#8217; that formal methods can solve? 
</strong></em>This involves translating the desired model behaviours into precise, mathematically checkable statements and then verifying whether the model satisfies these statements universally.</p><p>Formally speaking (no pun intended), the verification problem asks:</p><blockquote><p><em>Given an ML model and a formal specification (a property like safety or robustness expressed in mathematics), does the ML model satisfy the formal specification for all admissible inputs and behaviours?</em></p></blockquote><p>To solve this verification problem, we generally rely on two main techniques: <strong>constraint-based verification</strong> and <strong>abstraction-based verification.</strong></p><h2>Constraint-Based Verification</h2><p>Constraint-based verification converts the verification problem (the input domain, model structure, and property to be checked) into a single mathematical system of logical/optimisation constraints, typically fed to a <a href="https://people.eecs.berkeley.edu/~sseshia/pubdir/SMT-BookChapter.pdf">Satisfiability Modulo Theories (SMT)</a> or <a href="https://www.mathworks.com/help/optim/ug/mixed-integer-linear-programming-algorithms.html">Mixed-Integer Linear Programming (MILP) engine</a>. The solver treats this system of constraints as a single, complex puzzle. It doesn't check inputs one-by-one; instead, it uses powerful symbolic reasoning and logical deduction to intelligently prune away vast regions of the search space that cannot possibly contain a violation. 
Its goal is to find one concrete counterexample that satisfies every constraint at once.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!m_V3!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!m_V3!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png 424w, https://substackcdn.com/image/fetch/$s_!m_V3!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png 848w, https://substackcdn.com/image/fetch/$s_!m_V3!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png 1272w, https://substackcdn.com/image/fetch/$s_!m_V3!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!m_V3!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png" width="1456" height="1278" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/f1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1278,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:77072,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/169439154?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!m_V3!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png 424w, https://substackcdn.com/image/fetch/$s_!m_V3!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png 848w, https://substackcdn.com/image/fetch/$s_!m_V3!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png 1272w, https://substackcdn.com/image/fetch/$s_!m_V3!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff1f1ea93-55f7-4833-bb08-03da6f2e0bdd_1575x1382.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" 
width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Because this search explores all feasible sub-regions, the approach is <strong>sound</strong> (it never certifies a false property) and <strong>complete</strong> (it is guaranteed to find a counterexample if one exists). Given enough time and memory, the solver must terminate with either a proof that no violation exists or with a concrete counterexample that satisfies every constraint and therefore constitutes a genuine failure of the property. The price for such exhaustive coverage is computational cost, which can become prohibitive for large models.</p><h2>Abstraction-Based Verification</h2><p>Abstraction-based verification trades theoretical completeness for speed and scalability. 
In other words, it is <strong>sound, </strong>but it is <strong>incomplete</strong> (it may stop with &#8220;unknown&#8221; even though the property is either true or false), especially if it runs out of time or memory. Unlike constraint-based verification, which pushes the exact verification problem into a heavyweight solver, abstraction-based verification takes a lighter, step-by-step route. It watches the input set flow through the model structure (such as layers in a neural net or splits in a decision tree), and after each layer or decision point, wraps all the possible outputs in one slightly oversized &#8220;bubble&#8221; (which can take different shapes, such as a box, polyhedron, or zonotope). This step-by-step process, also known as bound propagation, produces an over-approximation at every stage, ensuring every real behaviour is safely enclosed within these bubbles. If the final bubble never touches the danger zone, the model is provably safe; if it does touch, the method refines the bubbles or reports &#8220;unknown.&#8221;</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Bbk0!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Bbk0!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png 424w, https://substackcdn.com/image/fetch/$s_!Bbk0!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png 848w, 
https://substackcdn.com/image/fetch/$s_!Bbk0!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png 1272w, https://substackcdn.com/image/fetch/$s_!Bbk0!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!Bbk0!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png" width="1456" height="1462" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1462,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:81255,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/169439154?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Bbk0!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png 424w, 
https://substackcdn.com/image/fetch/$s_!Bbk0!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png 848w, https://substackcdn.com/image/fetch/$s_!Bbk0!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png 1272w, https://substackcdn.com/image/fetch/$s_!Bbk0!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F18f5049a-a8c4-433e-bf18-9c6455615430_1575x1582.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" 
y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>This "bubble" approach has two main advantages. First, it's significantly faster and more scalable than constraint-based verification, because propagating a single bubble through each stage is much cheaper than solving the exact constraint system over the entire input region. Second, it provides trustworthy guarantees: if the final bubble sits fully in the safe region, the model is proven safe. However, since each bubble is padded for safety (over-approximated), it can sometimes become too large and overlap the unsafe zone. When this happens, the method can't immediately tell if there's a real problem or just extra padding. At this point, abstraction-based methods usually employ lightweight solvers to quickly determine whether a genuine violation exists within the bubble. If no violation is found, the bubble is refined by splitting the input region further. If time or resources run out before a clear answer is obtained, the method returns "unknown."</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!rU5o!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!rU5o!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png 424w, https://substackcdn.com/image/fetch/$s_!rU5o!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png 848w, 
https://substackcdn.com/image/fetch/$s_!rU5o!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png 1272w, https://substackcdn.com/image/fetch/$s_!rU5o!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!rU5o!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png" width="1378" height="788" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:788,&quot;width&quot;:1378,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!rU5o!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png 424w, https://substackcdn.com/image/fetch/$s_!rU5o!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png 848w, 
https://substackcdn.com/image/fetch/$s_!rU5o!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png 1272w, https://substackcdn.com/image/fetch/$s_!rU5o!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F113beebe-7843-4dce-b1bd-d6e0087b03e8_1378x788.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h1><strong>Comparison of Verification Techniques</strong></h1><blockquote><p>Constraint-based and abstraction-based methods answer the 
same fundamental question: <em>&#8220;<strong>Does my model always satisfy the given property?</strong>&#8221;</em> However, their mathematical approaches differ significantly. To illustrate this clearly, let's first consider a concise comparative overview:</p></blockquote><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/gUoiz/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/ec8e95f8-10bb-4f40-9ad3-8e18b78d827d_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:221,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/gUoiz/1/" width="730" height="221" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><p>To vividly demonstrate these differences, let's trace how each one tackles a simple verification problem. Consider this tiny neural network and a safety property:</p><pre><code>Layer 0 (input): x &#8712; [0,1]
Layer 1 (linear): z = 2x &#8722; 1
Layer 2 (ReLU): h = max(0, z)
Layer 3 (output):  y = &#8722;h + 0.8
Safety property:  y &gt; 0   (for every x in [0,1])</code></pre><p>The property claims the network's output is always positive. Is this true? Let&#8217;s see how the two verification methods handle this.</p><p>For <strong>constraint-based verification</strong>, its steps include:</p><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/APO7h/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/eaea75c4-5a78-48db-a533-0dca73f9a6a9_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:460,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/APO7h/1/" width="730" height="460" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><p>Constraint-based methods exhaustively explore possibilities to find definitive answers but face scalability limitations with complex or large models.</p><p>For <strong>abstraction-based verification</strong>, its steps include:</p><div id="datawrapper-iframe" class="datawrapper-wrap outer" 
data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/1XsEV/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/7967b242-0f90-405c-8b41-ff9cf5200d3a_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:630,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/1XsEV/1/" width="730" height="630" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><p>The abstraction-based approach delivers speed and scalability by making safe estimates. While its initial findings can be imprecise, its ability to iteratively refine its focus makes it a powerful and practical tool for verifying large-scale systems.</p><h1>The Elephant in the Room: Does it Scale?</h1><p>The critical question always arises when discussing formal methods: <em><strong>Does it scale?</strong></em></p><p>Historically, these techniques have carried a reputation for being powerful but difficult to scale, often seen as best suited for academic exercises or smaller models. This is a valid concern based on the field's origins.</p><p>But that is no longer the full story. This exact scaling challenge is what our team at Safe Intelligence has been dedicated to solving. 
By developing novel approaches that blend mathematical rigor with computational efficiency, we are making scalable verification a reality for the complex, production-grade AI systems being deployed today.</p><h1>Putting Formal Verification to Work</h1><p>Applying these advanced verification techniques enhances the reliability and safety of ML models in critical applications <strong>now</strong>. As you embark on your AI assurance journey, these are the core principles to guide you:</p><ul><li><p><strong>Go Beyond Testing.</strong> Testing confirms behavior for inputs you've checked; formal verification proves correctness for entire classes of inputs you haven't, providing a far stronger guarantee of safety.</p></li><li><p><strong>Precisely Define Correctness.</strong> The foundation of meaningful verification is translating business needs into mathematical properties. Whether it's robustness, fairness, or consistency, a clear specification is essential.</p></li><li><p><strong>Verify Systems, Not Just Models.</strong> <strong>Compositional verification</strong> analyzes the complete system (the training data, the software environment, and its deployment dynamics) to build genuinely trustworthy AI.</p></li></ul><div class="pullquote"><p>The era of treating ML verification as a purely academic pursuit is over. These techniques have matured and are already being applied to solve real-world safety and reliability challenges at scale. This isn't a theoretical future, <strong>it's happening now.</strong></p></div><p>In our upcoming articles, we&#8217;ll explore real-world case studies and practical applications that are helping make AI safer and more trustworthy across industries. But first, in the next article, we&#8217;ll show how we deploy these formally validated models in real systems. Stay tuned and stay safe. 
</p><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! Subscribe for free to receive new posts.</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><h1><strong>Further Reading</strong></h1><ul><li><p><a href="https://arxiv.org/pdf/2104.02466">A Review of Formal Methods applied to Machine Learning</a></p></li><li><p><a href="https://arxiv.org/pdf/2109.10317">Introduction to Neural Network Verification</a></p></li><li><p><a href="https://www.nowpublishers.com/article/Details/OPT-035">Algorithms for Verifying Deep Neural Networks</a></p></li><li><p><a href="https://arxiv.org/abs/2306.10426">Understanding Certified Training with Interval Bound Propagation</a></p></li><li><p><a href="https://web.stanford.edu/class/cs357/cegar.pdf">Counter&#8209;Example&#8209;Guided Abstraction Refinement (CEGAR)</a></p></li><li><p><a href="https://arxiv.org/html/2411.04594v2">Verification of Neural Networks Against Convolutional Perturbations via Parameterised Kernels</a></p></li><li><p><a href="https://arxiv.org/pdf/2408.03488">Recomposition: A New Technique for Efficient Compositional Verification</a></p></li></ul><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/formal-verification-of-ml/comments&quot;,&quot;text&quot;:&quot;Leave a comment&quot;,&quot;action&quot;:null,&quot;class&quot;:null}" 
data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/formal-verification-of-ml/comments"><span>Leave a comment</span></a></p><p></p>]]></content:encoded></item><item><title><![CDATA[ML Benchmarking Primer ]]></title><description><![CDATA[Measure, Compare, and Improve Your ML Systems]]></description><link>https://resilient.safeintelligence.ai/p/ml-benchmarking-primer</link><guid isPermaLink="false">https://resilient.safeintelligence.ai/p/ml-benchmarking-primer</guid><dc:creator><![CDATA[Brain Aboze]]></dc:creator><pubDate>Fri, 25 Jul 2025 07:00:59 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!CdLZ!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<h1><strong>Table of Contents</strong></h1><ul><li><p><a href="https://resilient.safeintelligence.ai/i/169188633/tldr">TL;DR</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169188633/why-benchmarking-matters">Why Benchmarking Matters</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169188633/what-counts-as-a-benchmark">What Counts as a "Benchmark"?</a></p><ul><li><p><a href="https://resilient.safeintelligence.ai/i/169188633/infrastructuresystem-benchmarks">Infrastructure/System Benchmarks</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169188633/model-benchmarks">Model Benchmarks</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169188633/data-benchmarks">Data Benchmarks</a></p></li></ul></li><li><p><a href="https://resilient.safeintelligence.ai/i/169188633/building-or-choosing-your-benchmarks">Building or Choosing Your Benchmarks</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/169188633/key-takeaways">Key Takeaways</a></p></li><li><p><a 
href="https://resilient.safeintelligence.ai/i/169188633/further-reading">Further Reading</a></p></li></ul><h1><strong>TL;DR</strong></h1><blockquote><p><em>In the previous article of this series, we explored how to design effective test sets to get the best out of our validation exercise. But once you've nailed validation, the next big question is: How well does your model perform compared to other approaches or experiments? That's exactly where machine learning benchmarking comes into play. So, is this a feel-good exercise? Does it make you feel inadequate, or is it a useful tool? We will cover how to think of ML benchmarking as layers, cover choices between public and private benchmarking, and present a clear systematic workflow to connect technical model performance directly to measurable business impact. Without benchmarks, every experiment run thinks it&#8217;s the valedictorian.</em></p></blockquote><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!CdLZ!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!CdLZ!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png 424w, https://substackcdn.com/image/fetch/$s_!CdLZ!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png 848w, 
https://substackcdn.com/image/fetch/$s_!CdLZ!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png 1272w, https://substackcdn.com/image/fetch/$s_!CdLZ!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!CdLZ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png" width="1456" height="816" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:816,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!CdLZ!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png 424w, https://substackcdn.com/image/fetch/$s_!CdLZ!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png 848w, 
https://substackcdn.com/image/fetch/$s_!CdLZ!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png 1272w, https://substackcdn.com/image/fetch/$s_!CdLZ!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F11e608a2-1895-450f-b5e5-b48c4bc095d9_1456x816.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>On October 14, 1947, test pilot Chuck Yeager pushed his plane through the &#8220;sound barrier&#8221;, a limit many believed was 
fatal. His flight didn&#8217;t just produce a sonic boom; it shattered the assumptions of an entire field, proving that what seemed impossible was merely the next frontier. In computing, benchmarks are our sound barriers. They are the standard measures that motivate change, drive breakthroughs, and push researchers to exceed perceived limits. While your next developed model might not produce a sonic boom, it faces its own set of critical barriers. (Also, if you&#8217;re working on breaking the lightspeed barrier, let us know &#8212; we&#8217;d like to invest!)</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!nnAW!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!nnAW!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg 424w, https://substackcdn.com/image/fetch/$s_!nnAW!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg 848w, https://substackcdn.com/image/fetch/$s_!nnAW!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg 1272w, https://substackcdn.com/image/fetch/$s_!nnAW!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!nnAW!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg" width="1456" height="951" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:951,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!nnAW!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg 424w, https://substackcdn.com/image/fetch/$s_!nnAW!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg 848w, https://substackcdn.com/image/fetch/$s_!nnAW!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg 1272w, https://substackcdn.com/image/fetch/$s_!nnAW!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F614feb1d-f348-4eac-bbda-4711648247a7_1600x1045.jpeg 1456w" sizes="100vw"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft 
icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Release of Bell X-1 before ignition, <a href="https://www.celestis.com/blog/celestis-and-the-bell-x-1/">Celestis</a></figcaption></figure></div><p>The previous article in the series gave us a foundation for model evaluation by designing test sets. Tests by themselves though are not anchored to anything. Benchmarking builds on tests by comparing multiple viable models. This comparison helps identify the best balance of performance, cost, latency, and risk for a specific business objective. Benchmarking can take time and be laborious but there are a lot of benefits. 
Let&#8217;s get started&#8230;</p><h1>Why Benchmarking Matters</h1><p>Benchmarking isn't about promoting one approach over another but objectively measuring and guiding requirements across any model deployment scenario. It fulfills four essential roles:</p><ol><li><p><strong>Objectively Compare Solutions</strong>: Benchmarks level the playing field. Rigorous benchmarks applied uniformly to models, infrastructure, and data methods cut through the marketing hype, internal biases, and subjective debates to deliver impartial rankings. However, as &#8220;<a href="https://photosauce.net/blog/post/lies-damned-lies-and-benchmarks-part-1-all-about-that-baseline">Lies, Damned Lies, and Benchmarks</a>&#8221; explains, public benchmarks can be gamed, as tweaking enough variables almost always yields a number that supports an argument, enabling deception and abuse.</p></li><li><p><strong>Make Confident Trade-Off Decisions: </strong>Benchmarks provide clear success criteria, but rarely does one solution outperform others on every front. Instead, they reveal trade-offs. One model may offer higher accuracy but increase latency, while another cuts costs but struggles with out-of-distribution data. Teams can confidently choose the solution that best fits their operational needs using a multi-metric decision framework, such as a weighted scorecard.</p></li><li><p><strong>Measure and Track Progress</strong>: Benchmarking isn't a single, isolated event. Regular, systematic benchmarking establishes historical performance records that help teams:</p><ol><li><p>Quantify incremental improvements.</p></li><li><p>Quickly identify performance regressions.</p></li><li><p>Demonstrate tangible ROI over time.</p></li></ol><p>Consistent benchmarking creates an evidence-backed narrative of continual improvement.</p></li><li><p><strong>Raise the Bar and Advance the Field</strong>: Effective benchmarks set ambitious, motivating targets, pushing teams to innovate continuously. 
Much like Yeager's aviation milestone, benchmarking sets clear goals that inspire teams to explore beyond established limits, driving continuous improvement and field-wide advancement.</p></li></ol><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!w6ZH!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!w6ZH!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png 424w, https://substackcdn.com/image/fetch/$s_!w6ZH!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png 848w, https://substackcdn.com/image/fetch/$s_!w6ZH!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png 1272w, https://substackcdn.com/image/fetch/$s_!w6ZH!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!w6ZH!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png" width="1456" height="533" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:533,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!w6ZH!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png 424w, https://substackcdn.com/image/fetch/$s_!w6ZH!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png 848w, https://substackcdn.com/image/fetch/$s_!w6ZH!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png 1272w, https://substackcdn.com/image/fetch/$s_!w6ZH!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1fe613b2-eda3-4711-8803-917493e3114b_1600x586.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Organisations consistently apply benchmarks to ensure their machine learning solutions reliably deliver measurable value, regardless of the underlying technology trends.</p><h1><strong>What Counts as a "Benchmark"?</strong></h1><p>A benchmark is a controlled experiment applied to any layer of the ML stack. To isolate the impact of a change, you must hold other variables constant. 
Benchmarking in ML isn&#8217;t a monolithic activity but a practice that can be broken down into three primary layers: infrastructure/system benchmarks, model benchmarks, and data benchmarks.</p><h2><strong>Infrastructure/System Benchmarks</strong></h2><ul><li><p><strong>Goal</strong>: Identify the infrastructure (hardware and software) configuration that delivers reliable, scalable, and efficient model execution, balancing performance, latency, throughput, cost, and energy use.</p></li><li><p><strong>Freeze</strong>: Model, dataset</p></li><li><p><strong>Vary</strong>: Hardware accelerators, precision modes (FP32 vs INT8), compiler optimization, deployment architectures, network topology for distributed training</p></li><li><p><strong>Metrics:</strong> Throughput (queries/sec), latency (p95/p99), cost per inference, energy use, CO&#8322; emissions</p></li><li><p><strong>Established Suites:</strong> <a href="https://mlcommons.org/benchmarks/">MLPerf</a> Training &amp; Inference</p></li><li><p><strong>Real-world Example</strong>: Your baseline is a ResNet-50 model on NVIDIA V100 GPUs with 80 ms latency at $0.001 per inference. 
Benchmarking shows that migrating to H100 GPUs with INT8 precision reduces latency to 12 ms and cost to $0.0005 per inference, clearly justifying the upgrade.</p></li></ul><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Je2E!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Je2E!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!Je2E!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!Je2E!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png 1272w, https://substackcdn.com/image/fetch/$s_!Je2E!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!Je2E!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png" width="1456" height="1093" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/f3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1093,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Je2E!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!Je2E!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!Je2E!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png 1272w, https://substackcdn.com/image/fetch/$s_!Je2E!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ff3b8f76d-beb6-45a1-9340-447e90e95c85_1575x1182.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h2><strong>Model Benchmarks</strong></h2><ul><li><p><strong>Goal</strong>: Determine the model architecture and training strategy that maximises predictive power and aligns with business and ethical requirements. 
Model benchmarks are the traditional and most common type of benchmarks.</p></li><li><p><strong>Freeze</strong>: Dataset, system (infrastructure, including evaluation metrics)</p></li><li><p><strong>Vary:</strong> Model architectures, hyperparameters, feature engineering, and optimisation methods.</p></li><li><p><strong>Metrics:</strong> Accuracy, <a href="https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc#:~:text=The%20area%20under%20the%20ROC,positive%20higher%20than%20the%20negative.">ROC-AUC</a>, <a href="https://blog.roboflow.com/mean-average-precision/">mAP,</a> <a href="https://huggingface.co/spaces/evaluate-metric/bleu">BLEU</a>, risk, and operational readiness (covering robustness, explainability, fairness and bias)</p></li><li><p><strong>Public Yardsticks:</strong> <a href="https://www.image-net.org/">ImageNet </a>, <a href="https://www.cs.toronto.edu/~kriz/cifar.html">CIFAR-10</a>, <a href="https://gluebenchmark.com/">GLUE</a>, <a href="https://www.kaggle.com/">Kaggle</a>/<a href="https://zindi.africa/">Zindi</a> competitions</p></li></ul><p><strong>Real-world Example</strong>: A random forest baseline yields an AUC of 0.81 for churn prediction. 
A benchmark study reveals that a well-tuned LightGBM model lifts AUC to 0.87, surpassing both the baseline and the operational readiness threshold.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Vuh0!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Vuh0!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!Vuh0!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!Vuh0!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png 1272w, https://substackcdn.com/image/fetch/$s_!Vuh0!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!Vuh0!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png" width="1456" height="1093" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/ed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1093,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Vuh0!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!Vuh0!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!Vuh0!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png 1272w, https://substackcdn.com/image/fetch/$s_!Vuh0!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed9b841f-2169-43d6-aff4-6df3ed33649c_1575x1182.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h2><strong>Data Benchmarks</strong></h2><ul><li><p><strong>Goal: </strong>Evaluate the ROI of data-centric strategies, quantifying how changes in data directly translate to model performance.</p></li><li><p><strong>Freeze:</strong> Model, system (infrastructure including evaluation metrics)</p></li><li><p><strong>Vary:</strong> Data volume, sampling strategies, annotation quality, synthetic data augmentation, data selection/valuation strategies.</p></li><li><p><strong>Metrics:</strong> Minority-class precision/recall, out-of-distribution robustness, performance uplift per dollar spent on data.</p></li><li><p><strong>Notable Suites:</strong> <a href="https://www.dataperf.org/">DataPerf</a></p></li><li><p><strong>Real-world Example</strong>: You have an image classification model with 75% accuracy. One benchmark evaluates the ROI of adding 50,000 synthetic images, which raises accuracy to 82%. 
Another evaluates the impact of spending the same budget on expert re-labelling of the 10,000 most-confused examples, which lifts accuracy to 85%. This data-centric benchmark provides a clear path for resource allocation.</p></li></ul><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!pOJG!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!pOJG!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!pOJG!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!pOJG!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png 1272w, https://substackcdn.com/image/fetch/$s_!pOJG!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!pOJG!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png" width="1456" height="1093" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1093,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!pOJG!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png 424w, https://substackcdn.com/image/fetch/$s_!pOJG!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png 848w, https://substackcdn.com/image/fetch/$s_!pOJG!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png 1272w, https://substackcdn.com/image/fetch/$s_!pOJG!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3f6644a4-2450-45f2-9ac3-f2a1cd74f6c2_1575x1182.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h1><strong>Building or Choosing Your Benchmarks</strong></h1><p>A benchmark&#8217;s value is not in the numbers it produces but in the decisions it enables. Before any experiments are run, the first decision is whether to adopt a public standard or develop a private, internal benchmark. This choice dictates your point of comparison: the broader industry or your own unique business context. 
Most mature organisations use a hybrid approach, but understanding the trade-offs is key.</p><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/0K9YD/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/5c0facc8-db77-42bf-ab67-dac2ccd5fae7_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:464,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/0K9YD/1/" width="730" height="464" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><p>Once you&#8217;ve chosen your approach, define success with metrics that map directly to business impact. Use a portfolio of measures to capture trade-offs and ensure real-world relevance. With strategy and metrics set, focus on execution: rigorous benchmarking should be reproducible, objective, and decision-driven. Below is a practical workflow to guide each benchmarking run from hypothesis to actionable insight.</p><ol><li><p><strong>Formulate a Falsifiable Hypothesis</strong>: Start with a precise, testable question linked to business KPIs. 
For example: &#8220;Replacing our baseline logistic regression model with an XGBoost ensemble will lift ROC-AUC from 0.78 to 0.85, keep median inference latency below 10 ms, and boost monthly revenue by a projected &#163;35k through a 12% reduction in false negatives.&#8221;</p></li><li><p><strong>Design the Reproducible Protocol</strong>: Meticulously document the "Freeze/Vary" framework, which is the blueprint for reproducibility and versioning.</p></li><li><p><strong>Automate Execution and Logging.</strong> Integrate your benchmarks directly into the CI/CD pipeline and leverage experiment-tracking platforms like MLflow, Weights &amp; Biases, or Vertex AI Experiments to automate execution. These tools capture every parameter, artefact, and result, ensuring a complete, auditable record for future analysis and reproducibility.</p></li><li><p><strong>Analyse, Decide, and Communicate.</strong> Analyse results and tie your findings directly to the original hypothesis and business objectives. Summarise with a clear, evidence-based recommendation on whether to ship, refine, or abandon, explicitly referencing both your starting hypothesis and the projected business impact.</p></li></ol><h1><strong>Key Takeaways</strong></h1><p>Benchmarking can be time-consuming and sometimes a bit dispiriting if the numbers aren&#8217;t what you would like. However, it is worth the investment for production systems and will help drive incremental improvement.</p><ul><li><p>Treat benchmarking as a decision-making discipline to drive business impact, manage trade-offs, and de-risk deployments, not just as a metric-gathering task.</p></li><li><p>Benchmarking isn&#8217;t limited to models. It applies across the entire ML stack. 
Evaluate infrastructure (hardware, deployment, efficiency), models (architecture, predictive power, robustness), and data (quality, quantity, coverage, ROI) to capture a complete, actionable view of system performance.</p></li><li><p>Every benchmark should run through a disciplined loop: formulate a business-driven hypothesis, design a reproducible protocol, automate execution and logging, then analyse results to decide whether to ship, tweak, or kill.</p></li><li><p>Build lasting trust by measuring performance, ensuring robustness, efficiency, fairness, and compliance, and guaranteeing full reproducibility through strict versioning and automated CI/CD checks.</p></li></ul><div class="pullquote"><p>If you're looking for reliable public benchmarks to help evaluate your ML models, here are some excellent places to start.</p></div><p>For Large Language Models (LLMs), an excellent resource is TechTarget&#8217;s guide, &#8220;<a href="https://www.techtarget.com/searchsoftwarequality/tip/Benchmarking-LLMs-A-guide-to-AI-model-evaluation">Benchmarking LLMs: A guide to AI model evaluation</a>.&#8221; This can be complemented by exploring the <a href="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/">Hugging Face Open LLM Leaderboard</a> for regularly updated rankings of popular models.</p><p>Meanwhile, <a href="https://mlcommons.org/2025/04/mlperf-client-v0-6/">MLPerf</a> provides widely recognised standards for training and inference performance. 
For computer vision, benchmarks like <a href="https://www.image-net.org/">ImageNet</a> and <a href="https://cocodataset.org/#home">COCO</a> assess model capability, while newer benchmarks like <a href="https://arxiv.org/html/2505.20612v1">Roboflow100-VL</a> (robustness for vision-language models) are great for testing real-world robustness.</p><p>For tabular data, platforms like <a href="https://www.kaggle.com/competitions">Kaggle</a> and <a href="https://www.openml.org/">OpenML</a> offer vast archives for establishing robust model baselines. <a href="https://zindi.africa/competitions">Zindi</a> complements these resources by providing fresh datasets that address specific business and societal challenges in Africa.</p><div class="pullquote"><p>COMING UP&#8230; In the next article of this series, we'll explore Formal Verification of ML.</p></div><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! 
Subscribe for free to receive new posts.</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><h1><strong>Further Reading</strong></h1><ul><li><p><a href="https://mlsysbook.ai/contents/core/benchmarking/benchmarking.html#:~:text=Data%20quality%E2%80%99s%20primacy%20in%20AI,197">Machine Learning Systems: Benchmarking AI</a></p></li><li><p><a href="https://arxiv.org/abs/2110.12773">Scientific machine learning benchmarks</a></p></li><li><p><a href="https://arxiv.org/abs/2411.12990#:~:text=depends%20on%20their%20design%20and,assessments%20to%20support%20benchmark%20comparability">BetterBench: Assessing AI Benchmarks, Uncovering Issues, and Establishing Best Practices</a></p></li><li><p><a href="https://overcast.blog/11-ml-performance-benchmarking-tools-you-should-know-20ad8eee9e3a">11 ML Performance Benchmarking Tools You Should Know</a></p></li><li><p><a href="https://www.hbs.edu/faculty/Pages/item.aspx?num=47650#:~:text=Netflix%3A%20Designing%20the%20Netflix%20Prize%20(A),-By%3A%20Karim%20R&amp;text=Hastings%20determined%20that%20a%2010,by%20up%20to%20%2489%20million.">Netflix: Designing the Netflix Prize</a></p></li></ul><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/ml-benchmarking-primer/comments&quot;,&quot;text&quot;:&quot;Leave a 
comment&quot;,&quot;action&quot;:null,&quot;class&quot;:null}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/ml-benchmarking-primer/comments"><span>Leave a comment</span></a></p><p></p>]]></content:encoded></item><item><title><![CDATA[Validation Begins with Test Design ]]></title><description><![CDATA[Why Your Test Set Is More Than Just a Data Split]]></description><link>https://resilient.safeintelligence.ai/p/validation-begins-with-test-design</link><guid isPermaLink="false">https://resilient.safeintelligence.ai/p/validation-begins-with-test-design</guid><dc:creator><![CDATA[Brain Aboze]]></dc:creator><pubDate>Thu, 17 Jul 2025 07:46:01 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!X2wg!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<h1><strong>Table of Contents</strong></h1><ul><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/tldr">TL;DR</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/getting-test-set-design-right">Getting Test Set Design Right</a></p><ul><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/representativeness">Representativeness</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/coverage">Coverage</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/probes-for-model-failure">Probe for Model Failure</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/maintain-strict-evaluation-boundaries">Maintain Strict Evaluation Boundaries</a></p></li></ul></li><li><p>Test Set Design Techniques</p><ul><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/simple-data-split">Simple Data Split</a></p></li><li><p><a 
href="https://resilient.safeintelligence.ai/i/168481926/cross-validation">Cross-Validation Techniques</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/bootstrapping">Bootstrapping</a></p></li></ul></li><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/what-do-evaluation-metrics-really-tell-us">What do evaluation metrics really tell us?</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/key-takeaways">Key Takeaways</a></p></li><li><p><a href="https://resilient.safeintelligence.ai/i/168481926/further-reading">Further reading</a></p></li></ul><h1>TL;DR</h1><blockquote><p><em>Hey there, welcome back to the series. In the last post, we talked about why testing machine learning (ML) feels more like uncertain geology than precise geometry. This piece builds on that, arguing that real model validation&#8212;the process that makes a model truly ready for the real world&#8212;doesn't start with chasing a high performance score. Instead, it begins with deliberately designing your test sets. A test set is more than just the data left over after training. 
Real confidence comes from strategically engineering your test data to be a tough, real-world simulator that actively seeks out your model's breaking points.</em></p></blockquote><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!X2wg!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!X2wg!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png 424w, https://substackcdn.com/image/fetch/$s_!X2wg!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png 848w, https://substackcdn.com/image/fetch/$s_!X2wg!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png 1272w, https://substackcdn.com/image/fetch/$s_!X2wg!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!X2wg!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png" width="1456" height="816" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:816,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!X2wg!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png 424w, https://substackcdn.com/image/fetch/$s_!X2wg!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png 848w, https://substackcdn.com/image/fetch/$s_!X2wg!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png 1272w, https://substackcdn.com/image/fetch/$s_!X2wg!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0ab878be-e7f8-4605-afea-a8405c66e73e_1456x816.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><div class="pullquote"><p><em>&#8220;All models are wrong, but some are useful.&#8221; &#8211; George E. P. Box</em></p></div><p>No matter how advanced techniques become, every ML model remains inherently an approximation of reality. Recognising this essential limitation sets the context for why rigorous validation is not merely desirable but indispensable. This inherent imperfection raises the critical question: <em><strong>How good is good enough?</strong></em> The answer lies in building <strong>confidence, </strong>specifically, confidence that the model will reliably perform as intended in real-world operational conditions. This essential pre-deployment confidence is forged through <strong>validation.</strong></p><p>Validation is a comprehensive assessment of all aspects of a model&#8217;s design, development, and final performance to ensure its readiness for deployment. 
Within this validation framework, two distinct technical processes are critical: testing and verification.</p><ul><li><p><strong>Testing</strong> involves running inference on curated datasets to evaluate model performance against specific, predefined criteria in the scope of supervised learning.</p></li><li><p><strong>Verification</strong>, briefly introduced here but to be explored in detail in a later part of this series, uses formal mathematical methods to prove or establish high confidence in critical properties.</p></li></ul><p>Before continuing, let's clarify our scope. In this series, we define validation as the technical, <em>pre-deployment</em> assurance process. While some include monitoring or governance under this umbrella, we treat them as separate concerns (post-deployment assurance). Validation focuses strictly on testing and verification that determine whether a model is ready for deployment.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!nTyG!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!nTyG!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png 424w, https://substackcdn.com/image/fetch/$s_!nTyG!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png 848w, 
https://substackcdn.com/image/fetch/$s_!nTyG!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png 1272w, https://substackcdn.com/image/fetch/$s_!nTyG!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!nTyG!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png" width="1456" height="854" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:854,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!nTyG!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png 424w, https://substackcdn.com/image/fetch/$s_!nTyG!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png 848w, 
https://substackcdn.com/image/fetch/$s_!nTyG!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png 1272w, https://substackcdn.com/image/fetch/$s_!nTyG!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F6a9f958f-859b-46ce-90d1-a763bc7a39c1_1600x938.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h1><strong>Getting Test Set Design Right</strong></h1><p>A test set is not just a leftover slice of data, nor is its worth measured by 
size or a single performance score. Its fundamental purpose is to expose whether a model is truly deployment-ready by surfacing its vulnerabilities. To achieve that, a well-designed test set rests on four pillars: representativeness of the production environment, exhaustive coverage of mission-critical use cases, targeted probes that hunt for hidden failure modes, and strict isolation from training data to preserve evaluation integrity.</p><h2><strong>Representativeness</strong></h2><p>A test set must resemble the world the model is about to enter so that its metrics give an <strong>unbiased, deployment-ready performance estimate</strong>. This demands disciplined sampling: include every region, population, or channel the model will encounter; balance rare but critical classes to avoid optimism that hides failure; and, when data are scarce, augment the training pool with synthetic examples so real-world instances can be reserved for evaluation. Furthermore, representativeness is temporal; a test set built on year-old data is a poor proxy for today's reality.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!n0LO!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!n0LO!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png 424w, https://substackcdn.com/image/fetch/$s_!n0LO!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png 848w, 
https://substackcdn.com/image/fetch/$s_!n0LO!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png 1272w, https://substackcdn.com/image/fetch/$s_!n0LO!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!n0LO!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png" width="1456" height="933" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:933,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!n0LO!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png 424w, https://substackcdn.com/image/fetch/$s_!n0LO!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png 848w, 
https://substackcdn.com/image/fetch/$s_!n0LO!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png 1272w, https://substackcdn.com/image/fetch/$s_!n0LO!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F45c1fa69-abc7-40e0-a969-a2ca641a3ab9_1600x1025.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Test set representativeness</figcaption></figure></div><h2><strong>Coverage</strong></h2><p>Typical test sets 
evaluate model performance at discrete data points, leaving vast, unexplored regions around these points. Models may appear highly performant at isolated test points yet fail significantly when slight variations occur. Therefore, robust test design must encompass broader regions around key test points, ensuring performance stability across relevant data neighbourhoods.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Qwoa!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Qwoa!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png 424w, https://substackcdn.com/image/fetch/$s_!Qwoa!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png 848w, https://substackcdn.com/image/fetch/$s_!Qwoa!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png 1272w, https://substackcdn.com/image/fetch/$s_!Qwoa!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!Qwoa!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png" width="1456" height="921" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:921,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Qwoa!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png 424w, https://substackcdn.com/image/fetch/$s_!Qwoa!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png 848w, https://substackcdn.com/image/fetch/$s_!Qwoa!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png 1272w, https://substackcdn.com/image/fetch/$s_!Qwoa!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F3e8663e1-2779-48cf-85e2-06f3f9a6864b_1600x1012.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Standard testing points and unexplored regions</figcaption></figure></div><p>A well-designed test set must go far beyond the &#8220;happy-path&#8221; cases where a model is already comfortable. It should deliberately sample the <em>whole</em> operational landscape, cutting across routine inputs <em>and</em> the rare, high-impact situations that can break a system. Work with domain experts to inject boundary conditions, edge cases, and infrequent but costly events (e.g., large fraud attempts). Slice scenarios by real-world stressors&#8212;such as peak load and holidays&#8212;so you see how performance shifts when the stakes are highest. 
Only this breadth of coverage reveals whether the model is ready for production.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!m_v5!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!m_v5!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png 424w, https://substackcdn.com/image/fetch/$s_!m_v5!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png 848w, https://substackcdn.com/image/fetch/$s_!m_v5!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png 1272w, https://substackcdn.com/image/fetch/$s_!m_v5!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!m_v5!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png" width="1456" height="945" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/fc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:945,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!m_v5!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png 424w, https://substackcdn.com/image/fetch/$s_!m_v5!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png 848w, https://substackcdn.com/image/fetch/$s_!m_v5!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png 1272w, https://substackcdn.com/image/fetch/$s_!m_v5!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Ffc24d60b-4c2d-41a6-943d-201c3eebf9e5_1600x1038.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Standard tests and edge cases</figcaption></figure></div><h2><strong>Probes for Model Failure</strong></h2><p>An effective test set must actively probe for model failures, not just confirm the model works. This requires designing specific inputs to uncover hidden weaknesses and logical inconsistencies. One approach is behavioural testing, where we verify the model's reasoning: does its output remain correctly unchanged when an irrelevant input is modified (invariance), and does it change predictably when a relevant one is adjusted (directional expectation)? The other approach is stress testing for stability, assessing performance on inputs with real-world imperfections like poor lighting, out-of-focus images, or noisy data. The most rigorous form of this is adversarial testing, which uses inputs carefully crafted to find a model's breaking points. 
If a test set never challenges the model, you&#8217;re not truly validating; it&#8217;s just performance theatre.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!KOLN!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!KOLN!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png 424w, https://substackcdn.com/image/fetch/$s_!KOLN!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png 848w, https://substackcdn.com/image/fetch/$s_!KOLN!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png 1272w, https://substackcdn.com/image/fetch/$s_!KOLN!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!KOLN!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png" width="1456" height="945" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:945,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!KOLN!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png 424w, https://substackcdn.com/image/fetch/$s_!KOLN!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png 848w, https://substackcdn.com/image/fetch/$s_!KOLN!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png 1272w, https://substackcdn.com/image/fetch/$s_!KOLN!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0388789b-9a70-4b1c-8e91-223c7ffa1405_1600x1038.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Integrating adversarial examples into testing</figcaption></figure></div><h2><strong>Maintain Strict Evaluation Boundaries</strong></h2><p>Isolation is non-negotiable; partition the test set at project inception, lock it away, and never let it contaminate training or model-selection steps. This absolute separation prevents all forms of data leakage, from direct sample overlap to subtle information bleed through proxy variables or shared preprocessing artefacts. 
Ultimately, this disciplined isolation is the only way to guarantee that the test set provides a true, unbiased measure of generalisation performance, ensuring you evaluate a model's ability to perform on unseen data, not merely its memorisation capacity.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!k8Zv!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!k8Zv!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png 424w, https://substackcdn.com/image/fetch/$s_!k8Zv!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png 848w, https://substackcdn.com/image/fetch/$s_!k8Zv!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png 1272w, https://substackcdn.com/image/fetch/$s_!k8Zv!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!k8Zv!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png" width="1456" height="808" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:808,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!k8Zv!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png 424w, https://substackcdn.com/image/fetch/$s_!k8Zv!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png 848w, https://substackcdn.com/image/fetch/$s_!k8Zv!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png 1272w, https://substackcdn.com/image/fetch/$s_!k8Zv!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F0640c979-4c88-432f-b1fd-dc7fe551980f_1600x888.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Data leakage</figcaption></figure></div><p></p><p></p><h1><strong>Test Set Design Techniques</strong></h1><p>While the principle of isolation demands a strict separation of test data, the <em>method</em> of this separation is a critical design decision. A poorly chosen split can introduce high bias (underfitting), leading the model to oversimplify, missing important complexities in real-world data, or, conversely, high variance (overfitting), causing the model to memorise specifics rather than general patterns. 
An overfitted model appears accurate during training but performs poorly on new, unseen data.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!MgnE!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!MgnE!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png 424w, https://substackcdn.com/image/fetch/$s_!MgnE!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png 848w, https://substackcdn.com/image/fetch/$s_!MgnE!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png 1272w, https://substackcdn.com/image/fetch/$s_!MgnE!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!MgnE!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png" width="1304" height="669" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/a7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:669,&quot;width&quot;:1304,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!MgnE!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png 424w, https://substackcdn.com/image/fetch/$s_!MgnE!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png 848w, https://substackcdn.com/image/fetch/$s_!MgnE!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png 1272w, https://substackcdn.com/image/fetch/$s_!MgnE!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fa7a12e2b-a839-4007-aeb1-b08db90969c5_1304x669.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Underfitting and overfitting</figcaption></figure></div><p>Test set design is not an afterthought; when crafted efficiently, it drives the model toward genuine production readiness and gives stakeholders evidence they can trust. That rigour starts with how you carve raw data into a training slice for model building and an isolated slice for evaluation. Below are the principal splitting strategies:</p><h2><strong>Simple Data Split</strong></h2><p>A straightforward way to partition your data for training and evaluation is by using either a <strong>train&#8211;test</strong> or a <strong>train&#8211;validation&#8211;test</strong> split. 
In the latter, a <strong>validation set</strong> is specifically used for hyperparameter tuning, model selection, and iterative improvements, while the <strong>test set</strong> is held out until the very end for a final, unbiased performance check.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!fhyj!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!fhyj!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png 424w, https://substackcdn.com/image/fetch/$s_!fhyj!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png 848w, https://substackcdn.com/image/fetch/$s_!fhyj!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png 1272w, https://substackcdn.com/image/fetch/$s_!fhyj!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!fhyj!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png" width="1222" height="561" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/cec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:561,&quot;width&quot;:1222,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!fhyj!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png 424w, https://substackcdn.com/image/fetch/$s_!fhyj!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png 848w, https://substackcdn.com/image/fetch/$s_!fhyj!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png 1272w, https://substackcdn.com/image/fetch/$s_!fhyj!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fcec3396f-f8a6-49bc-87d9-3dc2a7487c2e_1222x561.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Simple data split</figcaption></figure></div><p>A common split ratio is <strong>70% training</strong>, <strong>15% validation</strong>, and <strong>15% test</strong>. If you only need a high-level performance estimate, a <strong>70&#8211;30</strong> train&#8211;test split can suffice.</p><h2><strong>Cross Validation Techniques</strong></h2><p>While a simple data split is easy to implement, it can yield unstable or biased estimates, particularly if your dataset is small or not representative of the underlying distribution. Cross-validation overcomes these limitations by repeatedly training and evaluating the model on different subsets, offering a more robust view of how it will perform on unseen data. Crucially, every portion of the data is used for training and validation/testing at some stage, leading to more comprehensive performance metrics. 
Although cross-validation requires multiple training runs and can be computationally expensive&#8212;especially for large datasets&#8212;it often provides more reliable insights into real-world performance. It is especially valuable when you have limited data, allowing you to maximize both training and evaluation opportunities. Let&#8217;s look at some forms of cross-validation:</p><h3><strong>k-Fold Cross-Validation</strong></h3><p>This cross-validation strategy splits the dataset into &#8216;<em>k</em>&#8217; roughly equal &#8216;folds.&#8217; In each of the k iterations, one fold is used as the validation/test set, while the remaining folds are used for training. Performance metrics are averaged over the k folds for a more stable estimate.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!l2df!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!l2df!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png 424w, https://substackcdn.com/image/fetch/$s_!l2df!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png 848w, https://substackcdn.com/image/fetch/$s_!l2df!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png 1272w, 
https://substackcdn.com/image/fetch/$s_!l2df!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!l2df!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png" width="1222" height="601" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:601,&quot;width&quot;:1222,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!l2df!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png 424w, https://substackcdn.com/image/fetch/$s_!l2df!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png 848w, https://substackcdn.com/image/fetch/$s_!l2df!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png 1272w, 
https://substackcdn.com/image/fetch/$s_!l2df!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F2442d1ff-5788-46fb-ba2a-2eb40e1691ea_1222x601.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">K-fold cross-validation</figcaption></figure></div><h3><strong>Repeated k-Fold Cross-Validation</strong></h3><p>This strategy involves performing k-fold cross-validation multiple times with different random splits. This approach further reduces variance in the performance estimate by averaging results across all runs. 
Here, &#8216;<em>k</em>&#8217; still denotes the number of folds; the number of times you repeat the entire training&#8211;validation process with different random splits is a separate setting.</p><h3><strong>Stratified k-Fold Cross-Validation</strong></h3><p>This strategy is a variant of k-fold cross-validation, where each fold is created in a way that preserves the overall class distribution. In simple terms, if you have an imbalanced dataset with the target of 60% negative, 30% positive, and 10% neutral, each fold will aim to have roughly 60% negative, 30% positive, and 10% neutral samples. If you simply shuffle your data randomly, there is a chance that one fold may end up with more positive samples than another. Stratified k-fold tries to avoid that, as data imbalance can skew how you measure performance. Stratification ensures that every fold reflects the true proportions of each class (like a mini version of the full dataset).</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!XIwR!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!XIwR!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png 424w, https://substackcdn.com/image/fetch/$s_!XIwR!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png 848w, 
https://substackcdn.com/image/fetch/$s_!XIwR!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png 1272w, https://substackcdn.com/image/fetch/$s_!XIwR!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!XIwR!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png" width="1226" height="669" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:669,&quot;width&quot;:1226,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!XIwR!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png 424w, https://substackcdn.com/image/fetch/$s_!XIwR!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png 848w, 
https://substackcdn.com/image/fetch/$s_!XIwR!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png 1272w, https://substackcdn.com/image/fetch/$s_!XIwR!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F70d30489-f77a-4c08-9946-1756edeaad2b_1226x669.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Stratified k-Fold cross-validation</figcaption></figure></div><h3><strong>Leave-One-Out Cross Validation 
(LOOCV)</strong></h3><p>Leave-one-out cross-validation (LOOCV) is a special case of k-fold where <em>k</em> equals the number of data points (<em>n</em>). Each iteration uses one sample as the test set, while the remaining <em>n &#8722; 1</em> samples form the training set. A model is trained on these <em>n &#8722; 1</em> samples and validated on the single held-out sample. This procedure is repeated <em>n</em> times&#8212;once for each data point as the test set&#8212;and the final performance metric is the average across all iterations. This maximizes your data for training, often leading to lower bias in the estimate. However, training the model <em>n</em> times can be extremely computationally expensive, especially for large datasets. <strong>Leave-p-Out Cross-Validation (LpOCV)</strong> generalizes LOOCV by leaving out <em>p</em> samples instead of one. This strategy requires training on all remaining data points for each possible selection of <em>p</em> samples, and it is still computationally intensive.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!tE_z!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!tE_z!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png 424w, https://substackcdn.com/image/fetch/$s_!tE_z!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png 848w, 
https://substackcdn.com/image/fetch/$s_!tE_z!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png 1272w, https://substackcdn.com/image/fetch/$s_!tE_z!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!tE_z!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png" width="1456" height="776" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/c1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:776,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!tE_z!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png 424w, https://substackcdn.com/image/fetch/$s_!tE_z!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png 848w, 
https://substackcdn.com/image/fetch/$s_!tE_z!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png 1272w, https://substackcdn.com/image/fetch/$s_!tE_z!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c0bcdc-4a32-40d5-bb94-750e6a389120_1554x828.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Leave-One-Out cross validation</figcaption></figure></div><h3><strong>Monte Carlo (Repeated Random Sub-Sampling) 
Cross-Validation</strong></h3><p>Monte Carlo cross-validation creates multiple, independent train-test splits by repeatedly sampling the data at a fixed, predefined proportion (e.g., 70:30). For each split, a new model is trained from scratch and evaluated, with the final performance metric being the average across all iterations. While highly flexible, this method's primary trade-off is its lack of guaranteed coverage, as some data points may be tested multiple times while others are never selected for a test set at all.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!zc5y!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!zc5y!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png 424w, https://substackcdn.com/image/fetch/$s_!zc5y!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png 848w, https://substackcdn.com/image/fetch/$s_!zc5y!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png 1272w, https://substackcdn.com/image/fetch/$s_!zc5y!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!zc5y!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png" width="1342" height="728" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:728,&quot;width&quot;:1342,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!zc5y!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png 424w, https://substackcdn.com/image/fetch/$s_!zc5y!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png 848w, https://substackcdn.com/image/fetch/$s_!zc5y!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png 1272w, https://substackcdn.com/image/fetch/$s_!zc5y!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4369e63c-4ac8-499d-aafc-1df46e99df18_1342x728.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft 
icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Monte Carlo cross-validation</figcaption></figure></div><h3><strong>Nested Cross-Validation</strong></h3><p>Using the same cross-validation procedure and dataset for both tuning and final evaluation can result in overly optimistic and biased estimates. Nested cross-validation prevents this by embedding one cross-validation loop within another, so that no information leaks between hyperparameter tuning and performance assessment. The <strong>outer loop</strong> splits the data into <em>k</em> folds, designating one fold as the test fold and the remaining folds for training. Inside each outer training set, an <strong>inner loop</strong> performs its own cross-validation purely for hyperparameter tuning, selecting the best model configuration. 
The tuned model is then evaluated on the outer test fold. This process repeats for each fold in the outer loop, and the results are averaged to provide an unbiased performance estimate. Although more computationally demanding, nested cross-validation is invaluable for reliably comparing models and hyperparameter settings.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!v1tm!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!v1tm!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png 424w, https://substackcdn.com/image/fetch/$s_!v1tm!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png 848w, https://substackcdn.com/image/fetch/$s_!v1tm!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png 1272w, https://substackcdn.com/image/fetch/$s_!v1tm!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!v1tm!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png" width="1456" height="881" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:881,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!v1tm!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png 424w, https://substackcdn.com/image/fetch/$s_!v1tm!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png 848w, https://substackcdn.com/image/fetch/$s_!v1tm!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png 1272w, https://substackcdn.com/image/fetch/$s_!v1tm!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F4ab1cda5-3676-438e-aefd-a072fc367354_1600x968.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Nested cross validation</figcaption></figure></div><h3><strong>Time series Cross-Validation</strong></h3><p>These cross-validation strategies are designed to respect the chronological order of data, ensuring that only past observations are used to predict future outcomes. Two main approaches are a sliding window and an expanding window. In the sliding window approach, a fixed-size window of the most recent observations is used for training, then the window shifts forward to discard older data and include new arrivals. This strategy keeps the model focused on recent trends but may lose potentially useful historical information. Conversely, the expanding window method starts with an initial set of observations and grows over time, retaining all prior data. 
While this strategy preserves the full historical context, older data might become less relevant, and the training set can grow unwieldy.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!GbhN!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!GbhN!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png 424w, https://substackcdn.com/image/fetch/$s_!GbhN!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png 848w, https://substackcdn.com/image/fetch/$s_!GbhN!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png 1272w, https://substackcdn.com/image/fetch/$s_!GbhN!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!GbhN!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png" width="1456" height="1541" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/c1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1541,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!GbhN!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png 424w, https://substackcdn.com/image/fetch/$s_!GbhN!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png 848w, https://substackcdn.com/image/fetch/$s_!GbhN!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png 1272w, https://substackcdn.com/image/fetch/$s_!GbhN!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fc1c8e112-4070-4788-8bdf-1d5a675a3028_1512x1600.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Time-series cross-validation</figcaption></figure></div><p>Time series cross-validation simulates real-world forecasting scenarios by providing multiple, chronologically consistent validation points. However, the overlapping training sets can drive up computational costs, particularly for large datasets.</p><h2><strong>Bootstrapping</strong></h2><p>Bootstrapping is a resampling method for estimating model performance by repeatedly drawing new training sets with replacement from the original dataset. Sampling with replacement simply means that each time you pick a data point for training, you place it &#8220;back&#8221; into the pool so you can choose the same point again in subsequent draws. Each &#8220;bootstrap&#8221; produces a dataset the same size as the original but with duplicated entries. The <strong>out-of-bag</strong> (OOB) samples&#8212;instances not chosen in that draw&#8212;serve as the test set. 
After multiple iterations, you <strong>average the OOB results</strong> to estimate performance. Unlike cross-validation, bootstrapping may repeatedly sample certain points while omitting others, which can bias the performance estimate unless adjusted (e.g., via <a href="https://rasbt.github.io/mlxtend/user_guide/evaluate/bootstrap_point632_score/">.632/.632+</a> methods).</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!Sg_N!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!Sg_N!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png 424w, https://substackcdn.com/image/fetch/$s_!Sg_N!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png 848w, https://substackcdn.com/image/fetch/$s_!Sg_N!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png 1272w, https://substackcdn.com/image/fetch/$s_!Sg_N!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!Sg_N!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png" width="1456" height="640" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:640,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!Sg_N!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png 424w, https://substackcdn.com/image/fetch/$s_!Sg_N!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png 848w, https://substackcdn.com/image/fetch/$s_!Sg_N!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png 1272w, https://substackcdn.com/image/fetch/$s_!Sg_N!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5f7df6a3-c828-4c01-b74c-c98488297c39_1600x703.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Bootstrapping</figcaption></figure></div><h1><strong>What do evaluation metrics really tell us?</strong></h1><p>We often prioritise evaluation metrics like accuracy, precision, recall, F1 score, or RMSE when evaluating a model's health. See the <a href="https://neptune.ai/blog/performance-metrics-in-machine-learning-complete-guide">Neptune AI guide on performance metrics</a> for an excellent deep dive into how different metrics work and why they matter. However, it's crucial to remember these numbers are merely proxies&#8212;indirect measures of a model's true effectiveness in real-world scenarios. Because direct measurement of real-world impact is often impractical during development, we optimise against these surrogate metrics instead. But there's an inherent risk: focusing too heavily on optimising these metrics can cause a model to "overfit" to metric specifics, exploiting quirks in training data rather than capturing meaningful generalisable patterns. 
This pitfall aligns with <strong>Goodhart's Law</strong>, which states:</p><div class="pullquote"><p><em><strong>"When a measure becomes a target, it ceases to be a good measure."</strong></em></p></div><p>Given these limitations, true confidence is forged not in a single score but in the rigour of the validation process itself. This robust approach begins with precise planning, which involves defining the scope of validation&#8212;from the data to the final inference pipeline&#8212;translating business objectives into explicit, measurable assessments and curating the immutable test sets needed for evaluation. This blueprint is then operationalised within a repeatable validation pipeline to ensure the same standards judge every model. For high-stakes applications, the process is fortified by independent review teams to eliminate confirmation bias, transforming validation from a one-time gate into an iterative cycle of continuously raising the bar for system safety and effectiveness.</p><h1><strong>Key Takeaways</strong></h1><ul><li><p>Validation is the entire pre-deployment assurance umbrella (testing and verification), while monitoring is strictly a post-deployment task.</p></li><li><p>Design test data for representativeness, broad coverage (including edge cases), explicit failure probes, and airtight isolation.</p></li><li><p>Evaluation metrics are proxies, not goals. 
Goodhart&#8217;s Law reminds us that over-optimising a single metric corrodes real-world usefulness; confidence comes from a well-governed, repeatable validation pipeline, not a solitary performance score.</p></li><li><p>Choose the right split strategy to manage bias/variance trade-offs; pick deliberately rather than by habit.</p></li></ul><div class="pullquote"><p>COMING UP&#8230; In the next article of this series, we'll explore ML Benchmarking.</p></div><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! Subscribe for free to receive new posts.</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><p></p><h1><strong>Further reading</strong></h1><ul><li><p><a href="https://neptune.ai/blog/performance-metrics-in-machine-learning-complete-guide">Performance Metrics in Machine Learning [Complete Guide]</a></p></li><li><p><a href="https://neptune.ai/blog/cross-validation-in-machine-learning-how-to-do-it-right">Cross-Validation in Machine Learning: How to Do It Right</a></p></li><li><p><a href="https://scikit-learn.org/stable/modules/cross_validation.html">Cross-validation: evaluating estimator performance</a></p></li><li><p><a href="https://machinelearningmastery.com/nested-cross-validation-for-machine-learning-with-python/">Nested Cross-Validation for Machine Learning with Python</a></p></li><li><p><a 
href="https://datasciencedojo.com/blog/bootstrap-sampling/">Understanding Bootstrap Sampling: A Guide for Data Enthusiasts</a></p></li></ul><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/validation-begins-with-test-design/comments&quot;,&quot;text&quot;:&quot;Leave a comment&quot;,&quot;action&quot;:null,&quot;class&quot;:null}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/validation-begins-with-test-design/comments"><span>Leave a comment</span></a></p><div class="captioned-button-wrap" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/validation-begins-with-test-design?utm_source=substack&utm_medium=email&utm_content=share&action=share&quot;,&quot;text&quot;:&quot;Share&quot;}" data-component-name="CaptionedButtonToDOM"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient X Design! Feel free to share this piece with anyone who might find it useful.</p></div><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/validation-begins-with-test-design?utm_source=substack&utm_medium=email&utm_content=share&action=share&quot;,&quot;text&quot;:&quot;Share&quot;}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/validation-begins-with-test-design?utm_source=substack&utm_medium=email&utm_content=share&action=share"><span>Share</span></a></p></div><p></p>]]></content:encoded></item><item><title><![CDATA[Safety at Scale: High-Reliability ML Round-up, Jan–Jun 2025]]></title><description><![CDATA[A round-up of key developments in AI regulation, aviation, and finance]]></description><link>https://resilient.safeintelligence.ai/p/safety-at-scale-high-reliability</link><guid isPermaLink="false">https://resilient.safeintelligence.ai/p/safety-at-scale-high-reliability</guid><dc:creator><![CDATA[Brain 
Aboze]]></dc:creator><pubDate>Mon, 14 Jul 2025 07:42:05 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!8Zrx!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!8Zrx!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!8Zrx!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png 424w, https://substackcdn.com/image/fetch/$s_!8Zrx!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png 848w, https://substackcdn.com/image/fetch/$s_!8Zrx!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png 1272w, https://substackcdn.com/image/fetch/$s_!8Zrx!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!8Zrx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png" width="1080" height="693" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/ed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:693,&quot;width&quot;:1080,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:954986,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:&quot;https://resilient.safeintelligence.ai/i/167809866?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fbf60612d-bac3-4a48-aed7-a067982e4747_1080x1080.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!8Zrx!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png 424w, https://substackcdn.com/image/fetch/$s_!8Zrx!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png 848w, https://substackcdn.com/image/fetch/$s_!8Zrx!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png 1272w, https://substackcdn.com/image/fetch/$s_!8Zrx!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fed64fe0e-cc62-459e-806e-0be2ac07592d_1080x693.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" 
width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><h1>Executive Summary</h1><p>During the first half of 2025, machine learning governance transitioned from principle to practice. Regulators translated guidance into enforceable rules, while organisations strengthened validation processes, formalised risk thresholds, and expanded transparency. In aviation and financial services, two of the most tightly regulated, safety-critical domains, ML systems have demonstrated measurable performance gains and reflect an industry that is striving for ever-greater reliability.</p><div><hr></div><h1>Operationalising ML Risk Management</h1><p>Regulators moved decisively in the first half of 2025, turning guidelines into enforceable obligations and accelerating the maturity curve for responsible AI programmes. 
Highlights from regulators around the world include:</p><ul><li><p><strong>EU AI Act (Europe)</strong> &#8211; First provisions in force (Feb&#8239;2) banning unacceptable&#8209;risk AI and mandating AI literacy; full high&#8209;risk obligations commence Aug&#8239;2&#8239;2025, covering data quality, documentation, risk &amp; quality management, and EU database registration. <strong><a href="https://digital-strategy.ec.europa.eu/en/policies/regulatory-framework-ai">Read analysis &#8594;</a></strong></p></li><li><p><strong>Singapore Consensus on Global AI Safety (2025)</strong> &#8211; Introduces a defence&#8209;in&#8209;depth safety model spanning safe <strong>development</strong>, rigorous <strong>assessment</strong> (verification &amp; validation), and ongoing <strong>control</strong>. <strong><a href="https://arxiv.org/pdf/2506.20702">Read analysis &#8594;</a></strong></p></li><li><p><strong>United States</strong> &#8211; The NIST AI Risk Management Framework became the de facto national benchmark while states advanced their own laws. <strong><a href="https://www.wiz.io/academy/nist-ai-risk-management-framework">Read analysis &#8594;</a></strong> <strong>Texas</strong> led with the <em>Responsible AI Governance Act (TRAIGA)</em>, the first statute to ban certain high&#8209;risk uses and launch an AI regulatory sandbox, as over a dozen other states draft similar bills. <strong><a href="https://www.dlapiper.com/en-ca/insights/publications/2025/06/texas-adopts-the-responsible-ai-governance-act">Read analysis &#8594;</a></strong></p></li><li><p><strong>Japan</strong> &#8211; Parliament passed an innovation-first AI law establishing a cabinet-level AI Strategy HQ and voluntary guidelines to attract talent and investment while promoting responsible development. 
<strong><a href="https://iapp.org/news/a/japan-passes-innovation-focused-ai-governance-bill">Read analysis &#8594;</a></strong></p></li><li><p><strong>Kenya &amp; Wider Africa</strong> &#8211; Kenya&#8217;s National AI Strategy 2025&#8209;30 and similar initiatives across Africa combine ethical, inclusive, and innovation&#8209;centric pillars to foster fintech&#8209;driven growth. <strong><a href="https://ict.go.ke/node/641">Read analysis &#8594;</a></strong></p></li><li><p><strong>Gulf Cooperation Council (GCC)</strong> &#8211; Adopted &#8220;soft&#8209;regulation&#8221; playbooks built on national AI visions and ethical charters, enabling fast innovation while binding enforcement remains light. <strong><a href="https://arxiv.org/pdf/2505.02174">Read analysis &#8594;</a></strong></p></li></ul><h1>Sector Spotlight in High-Stakes Domains</h1><h2><strong>Aviation: Ensuring Safety in the Skies</strong></h2><p>The aviation sector has effectively balanced innovation and safety by utilising advanced machine learning (ML) applications.</p><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/YLvYR/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/30ed8b6e-b67e-453e-b609-3b5f96e6f2f0_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:697,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/YLvYR/1/" width="730" height="697" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in 
e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><h2>Finance: Risk&#8209;Proofing Decisions</h2><p>Financial institutions harnessed ML to enhance decision-making, strengthen compliance, and streamline operations.</p><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/QZzLE/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/91264110-2384-4d6a-85c7-7ef4e4549238_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:556,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/QZzLE/1/" width="730" height="556" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><h1>Events and Conferences</h1><p>The first half of 2025 featured numerous conferences and workshops focusing on trustworthy, safe and reliable ML. 
Here is a recap.</p><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/Svm7k/2/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/d968dc13-a8c8-40dc-a278-a6b6e515e26f_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:887,&quot;title&quot;:&quot;| Created with Datawrapper&quot;,&quot;description&quot;:&quot;Create interactive, responsive &amp; beautiful charts &#8212; no code required.&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/Svm7k/2/" width="730" height="887" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><h1><strong>Looking Ahead (H2 2025 &#8594; 2026)</strong></h1><p>In the second half of 2025, we are excited to launch our new publication, <strong>Resilient by Design. </strong>This space is dedicated to exploring the art and science of building robust machine learning systems. You can expect technical insights, real-world case studies, and community collaboration.
Subscribe for free, receive insights directly in your inbox, and become part of a growing community committed to making machine learning validated, reliable, repeatable, and robust.</p><p><strong>Pull up a seat and join the conversation:</strong> <a href="https://resilient.safeintelligence.ai/">Resilient by Design &#8594;</a></p><div class="pullquote"><p>Stay safe.</p></div><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient By Design! Subscribe for free to receive new posts.</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><div><hr></div><h1><strong>References</strong></h1><ol><li><p><a href="https://www.aerotime.aero/articles/how-aviation-professionals-can-stay-competitive-in-2025">AeroTime: How aviation professionals can stay competitive in 2025</a></p></li><li><p><a href="https://www.airbus.com/en/newsroom/stories/2025-04-digital-twins-accelerating-aerospace-innovation-from-design-to-operations">Airbus: Digital Twins: Accelerating aerospace innovation from design to operations</a></p></li><li><p><a href="https://appinventiv.com/blog/digital-twin-in-aerospace/">Appinventiv: How Digital Twin Technology is Transforming Airline Operations and Safety</a></p></li><li><p><a href="https://www.ainvest.com/news/joby-aviation-dubai-milestone-catalyst-urban-air-mobility-mainstream-adoption-2507/">AInvest: Joby Aviation's Dubai Milestone: A Catalyst for Urban Air
Mobility's Mainstream Adoption</a></p></li><li><p><a href="https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0317914&amp;type=printable#:~:text=Despite%20the%20generally%20low%20danger,prediction%20of%20commercial%20aircraft%20accidents.">Machine Learning-Based Anomaly Detection in Commercial Aircraft</a></p></li><li><p><a href="https://resolve.cambridge.org/core/journals/aeronautical-journal/article/applying-artificial-neural-networks-for-multidimensional-anomaly-detection-based-on-flight-data-monitoring-during-final-approaches/1CA88E673B681A3C5A7571A6767E9BCF?utm_source=chatgpt.com">Applying artificial neural networks for multidimensional anomaly detection based on flight data monitoring during final approaches</a></p></li><li><p><a href="https://www.caa.co.uk/media/yfclgu2m/cap3064a-v1-3-2.pdf">The CAA&#8217;s Response to Emerging AI-Enabled Automation: Part A: Strategy for Regulating AI and Advanced Automation in Aerospace</a></p></li><li><p><a href="https://www.credolab.com/blog/how-alternative-and-traditional-data-work-better-together">Credolab: How Alternative and Traditional Data Work Better Together</a></p></li><li><p><a href="https://www.fca.org.uk/publication/research-notes/how-ai-role-credit-decisions-explained.pdf">Financial Conduct Authority: Credit where credit is due: how can AI's role in credit decisions be explained?</a></p></li><li><p><a href="https://www.silenteight.com/blog/2025-trends-in-aml-and-financial-crime-compliance-a-data-centric-perspective-and-deep-dive-into-transaction-monitoring">Silent Eight 360: 2025 Trends in AML and Financial Crime Compliance: A Data-Centric Perspective and Deep Dive into Transaction Monitoring</a></p></li><li><p><a href="https://journalwjarr.com/sites/default/files/fulltext_pdf/WJARR-2025-1099.pdf">WJARR: Federated learning for privacy-preserving data analytics in mobile applications</a></p></li><li><p><a 
href="https://cloud.google.com/blog/products/identity-security/google-cloud-and-swift-pioneer-advanced-ai-and-federated-learning-tech">Google Cloud: Google Cloud and Swift pioneer advanced AI and federated learning tech to help combat payments fraud</a></p></li><li><p><a href="https://www.bankofengland.co.uk/financial-stability-in-focus/2025/april-2025">Bank of England: Financial Stability in Focus: Artificial intelligence in the financial system</a></p></li><li><p><a href="https://liquidityfinder.com/insight/technology/ai-for-trading-2025-complete-guide">LiquidityFinder: AI for Trading: The 2025 Complete Guide</a></p></li></ol><div><hr></div><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/safety-at-scale-high-reliability/comments&quot;,&quot;text&quot;:&quot;Leave a comment&quot;,&quot;action&quot;:null,&quot;class&quot;:null}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/safety-at-scale-high-reliability/comments"><span>Leave a comment</span></a></p><div class="captioned-button-wrap" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/safety-at-scale-high-reliability?utm_source=substack&utm_medium=email&utm_content=share&action=share&quot;,&quot;text&quot;:&quot;Share&quot;}" data-component-name="CaptionedButtonToDOM"><div class="preamble"><p class="cta-caption">Thanks for reading Resilient By Design! 
This post is public so feel free to share it.</p></div><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/safety-at-scale-high-reliability?utm_source=substack&utm_medium=email&utm_content=share&action=share&quot;,&quot;text&quot;:&quot;Share&quot;}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/safety-at-scale-high-reliability?utm_source=substack&utm_medium=email&utm_content=share&action=share"><span>Share</span></a></p></div>]]></content:encoded></item><item><title><![CDATA[ML Testing Refresher]]></title><description><![CDATA[Why Testing ML Feels Like Geology, Not Geometry]]></description><link>https://resilient.safeintelligence.ai/p/ml-testing-refresher-ae9</link><guid isPermaLink="false">https://resilient.safeintelligence.ai/p/ml-testing-refresher-ae9</guid><dc:creator><![CDATA[Brain Aboze]]></dc:creator><pubDate>Thu, 03 Jul 2025 08:45:22 GMT</pubDate><enclosure url="https://substackcdn.com/image/fetch/$s_!v8UI!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png" length="0" type="image/jpeg"/><content:encoded><![CDATA[<h2>Table of Contents</h2><ul><li><p><a href="https://safeintelai.substack.com/i/165627475/tldr">TL;DR</a></p></li><li><p><a href="https://safeintelai.substack.com/i/165627475/why-ml-system-testing-is-different-the-skyscraper-vs-rock-face-analogy">Why ML System Testing is Different: The Skyscraper vs. 
Rock Face Analogy</a></p></li><li><p><a href="https://safeintelai.substack.com/i/165627475/a-deeper-look-into-ml-model-testing">A Deeper Look into ML Model Testing</a></p></li><li><p><a href="https://safeintelai.substack.com/i/165627475/ml-model-testing-across-ml-paradigms">ML Model Testing Across ML Paradigms</a></p><ul><li><p><a href="https://safeintelai.substack.com/i/165627475/supervised-learning">Supervised Learning</a></p></li><li><p><a href="https://safeintelai.substack.com/i/165627475/unsupervised-learning">Unsupervised Learning</a></p></li><li><p><a href="https://safeintelai.substack.com/i/165627475/reinforcement-learning">Reinforcement Learning</a></p></li></ul></li><li><p><a href="https://safeintelai.substack.com/i/165627475/takeaways">Key Takeaways</a></p></li><li><p><a href="https://safeintelai.substack.com/i/165627475/resources-and-further-reading">Resources &amp; Further Reading</a></p></li></ul><h2><strong>TL;DR</strong></h2><blockquote><p><em>Testing machine learning (ML) systems fundamentally differs from testing traditional code. It requires a multilayered approach that tests three artefacts: code, data, and models. Classic tests, such as unit and integration tests, are combined with specialised data, model, and end-to-end infrastructure tests. Performance metrics alone are insufficient; complementary tests are required. 
Each ML paradigm (supervised, unsupervised, and reinforcement learning) also needs distinct testing objectives and evaluation techniques.</em></p></blockquote><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!v8UI!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!v8UI!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png 424w, https://substackcdn.com/image/fetch/$s_!v8UI!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png 848w, https://substackcdn.com/image/fetch/$s_!v8UI!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png 1272w, https://substackcdn.com/image/fetch/$s_!v8UI!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!v8UI!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png" width="1456" height="816" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:816,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:2113500,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:true,&quot;internalRedirect&quot;:&quot;https://safeintelai.substack.com/i/165627475?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!v8UI!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png 424w, https://substackcdn.com/image/fetch/$s_!v8UI!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png 848w, https://substackcdn.com/image/fetch/$s_!v8UI!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png 1272w, https://substackcdn.com/image/fetch/$s_!v8UI!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F7c026421-1924-4590-b118-eedcb519d2c3_1456x816.png 1456w" sizes="100vw" fetchpriority="high"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" 
height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a></figure></div><p>Testing ML systems is hard due to their complexity across multiple dimensions&#8212;code, data, and models. Traditional testing methods, such as unit tests, integration tests, acceptance tests, system tests (functional and non-functional), regression tests, and practices like test-driven development (TDD), may feel tedious, but they provide a valuable, repeatable framework ensuring software correctness. In contrast, ML model testing primarily delivers performance statistics, which can sometimes feel ambiguous. What does 91% accuracy genuinely mean in practice?</p><h2><strong>Why ML System Testing is Different: The Skyscraper vs. Rock Face Analogy</strong></h2><p>Let's consider a visual analogy to put things into perspective: Imagine testing a glass skyscraper versus a fissured rock face. 
The skyscraper surface can be confidently verified with precise measurements and well-understood engineering principles. A handful of targeted tests are sufficient to justify your confidence. In contrast, a rock face is far more complex. Composed of possibly igneous, sedimentary, and metamorphic layers shaped by processes like sedimentation, pressure, erosion, heating, and cooling, its surface is inherently irregular and unpredictable. While you can take samples and apply statistical extrapolations, each measurement offers only an approximation, never absolute certainty. ML testing is much like the rock face.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!eu9d!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!eu9d!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png 424w, https://substackcdn.com/image/fetch/$s_!eu9d!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png 848w, https://substackcdn.com/image/fetch/$s_!eu9d!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png 1272w, https://substackcdn.com/image/fetch/$s_!eu9d!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png 1456w" sizes="100vw"><img 
src="https://substackcdn.com/image/fetch/$s_!eu9d!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png" width="1456" height="486" data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:486,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:false,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!eu9d!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png 424w, https://substackcdn.com/image/fetch/$s_!eu9d!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png 848w, https://substackcdn.com/image/fetch/$s_!eu9d!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png 1272w, https://substackcdn.com/image/fetch/$s_!eu9d!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F514c1d92-1cbc-45a5-a6f4-4f2b2698a1fb_1600x534.png 1456w" sizes="100vw"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container 
restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Glass vs Rock illustration</figcaption></figure></div><p>Traditional software behaves predictably; the same input always yields the same output. ML systems, on the other hand, learn patterns from ever-changing data, creating opaque decision boundaries. A traditional software failure is typically straightforward: a button isn't clickable, or a form doesn't submit. Such deterministic logic makes debugging clear-cut. Conversely, an ML system's slight input variations (tiny perturbations) can trigger significantly different outputs or flip predictions. 
Let&#8217;s take a look at the comparison testing workflow for traditional software and ML systems below:</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!aV3v!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!aV3v!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png 424w, https://substackcdn.com/image/fetch/$s_!aV3v!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png 848w, https://substackcdn.com/image/fetch/$s_!aV3v!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png 1272w, https://substackcdn.com/image/fetch/$s_!aV3v!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!aV3v!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png" width="1456" height="415" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:415,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:142963,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://safeintelai.substack.com/i/165627475?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!aV3v!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png 424w, https://substackcdn.com/image/fetch/$s_!aV3v!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png 848w, https://substackcdn.com/image/fetch/$s_!aV3v!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png 1272w, https://substackcdn.com/image/fetch/$s_!aV3v!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5b3dd361-a081-4159-949b-e2f732eaf2fb_2788x794.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" 
height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Traditional system testing VS ML system testing, <a href="http://source">Google</a></figcaption></figure></div><p></p><p>Here&#8217;s a comprehensive overview that clearly differentiates the types of testing:</p><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/EZP0u/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/1eb252d6-181d-4f42-a386-15037c4e75b9_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:872,&quot;title&quot;:&quot;Types of Testing Differentiation&quot;,&quot;description&quot;:&quot;&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/EZP0u/1/" width="730" 
height="872" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><p>The real cost of machine learning failures extends far beyond mere technical glitches, manifesting as significant financial losses, reputational damage, and even safety hazards. Consider the recent <a href="https://www.reuters.com/business/autos-transportation/amazons-zoox-issues-second-software-recall-this-month-after-san-francisco-crash-2025-05-29/">Zoox Robotaxi</a> recalls, which highlighted the critical need for comprehensive testing; a crash in San Francisco revealed insufficient testing of pedestrian-detection forecasts, forcing Amazon's robotaxi unit to recall over 270 vehicles. Similarly, consider <a href="https://www.nytimes.com/2021/11/02/business/zillow-q3-earnings-home-flipping-ibuying.html">Zillow's iBuying collapse</a>, where its ML models consistently overestimated home prices, resulting in a $420 million loss and the entire program being shut down.
These examples underscore that rigorous, multifaceted testing is not merely a good practice but an indispensable safeguard against catastrophic operational and financial repercussions.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!flvu!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!flvu!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png 424w, https://substackcdn.com/image/fetch/$s_!flvu!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png 848w, https://substackcdn.com/image/fetch/$s_!flvu!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png 1272w, https://substackcdn.com/image/fetch/$s_!flvu!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!flvu!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png" width="1456" height="486" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:486,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:null,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:null,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!flvu!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png 424w, https://substackcdn.com/image/fetch/$s_!flvu!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png 848w, https://substackcdn.com/image/fetch/$s_!flvu!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png 1272w, https://substackcdn.com/image/fetch/$s_!flvu!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F17f7e1f1-467d-40e0-ac21-54b038ab3a23_1600x534.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path 
d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Cost of machine learning failure illustration</figcaption></figure></div><h2><strong>A Deeper Look into ML Model Testing</strong></h2><p>Ensuring that ML models behave as intended&#8212;consistent and stable&#8212;is crucial, especially when they power critical real-world applications. A model's behaviour can vary significantly based on code, data quality, model architecture, training procedures, production environments, and even adversarial scenarios. 
The primary goal of ML system testing is to build confidence that the model will function correctly, robustly, and predictably under diverse, uncertain, and complex real-world conditions.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!1mg-!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!1mg-!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png 424w, https://substackcdn.com/image/fetch/$s_!1mg-!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png 848w, https://substackcdn.com/image/fetch/$s_!1mg-!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png 1272w, https://substackcdn.com/image/fetch/$s_!1mg-!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!1mg-!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png" width="1600" height="895" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:895,&quot;width&quot;:1600,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:121924,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:null,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!1mg-!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png 424w, https://substackcdn.com/image/fetch/$s_!1mg-!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png 848w, https://substackcdn.com/image/fetch/$s_!1mg-!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png 1272w, https://substackcdn.com/image/fetch/$s_!1mg-!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F1881abed-7219-40cb-bf41-9c346a9c9512_1600x895.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" 
xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Multi-layered ML system testing approach</figcaption></figure></div><p>It is best practice to employ a multi-layered approach to thoroughly test our ML systems, which often includes:</p><ul><li><p>Data tests, which include, but are not limited to, schema validation, data quality assessments, statistical distribution checks, and outlier and drift detection.</p></li><li><p>Model testing spans two critical dimensions: performance and behaviour. Performance testing evaluates how effectively a model meets its objectives through quantitative metrics and standard benchmarks, assessing its generalisation capabilities and detecting overfitting or underfitting. Behaviour testing shifts towards qualitative, scenario-driven assessments, examining how the model operates under challenging or specific conditions. 
This includes evaluating robustness to noise and adversarial attacks, interpretability (local and global explanations), compliance with safety constraints, and fairness measures like demographic parity and equal opportunity.</p></li><li><p>ML infrastructure and end-to-end testing are crucial for operationalising ML systems. These tests span the entire MLOps lifecycle, including rigorous CI/CD pipeline checks, validating data integrity via feature store assertions, verifying the functionality of model registry and serving endpoints, establishing robust rollback mechanisms, configuring appropriate monitoring and alerting thresholds, and ensuring the correct execution of various deployment strategies.</p></li></ul><h2><strong><a href="https://safeintelai.substack.com/i/165627475/ml-model-testing-across-ml-paradigms">ML Model Testing Across ML Paradigms</a></strong></h2><p>Just as different geological formations would require different assessment techniques, model testing's specifics for performance and behaviour must adapt to the model's learning approach, or its underlying paradigm. The three primary ML paradigms&#8212;supervised, unsupervised, and reinforcement learning&#8212;each have their difficulties and necessitate distinct testing approaches. 
Here's a table summarising the primary ML paradigms and their corresponding test objectives:</p><div id="datawrapper-iframe" class="datawrapper-wrap outer" data-attrs="{&quot;url&quot;:&quot;https://datawrapper.dwcdn.net/bLFi1/1/&quot;,&quot;thumbnail_url&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/e8fe3832-ab2e-4c25-ac25-e14321049bfe_1260x660.png&quot;,&quot;thumbnail_url_full&quot;:&quot;&quot;,&quot;height&quot;:359,&quot;title&quot;:&quot;ML Model Testing Across ML Paradigms&quot;,&quot;description&quot;:&quot;&quot;}" data-component-name="DatawrapperToDOM"><iframe id="iframe-datawrapper" class="datawrapper-iframe" src="https://datawrapper.dwcdn.net/bLFi1/1/" width="730" height="359" frameborder="0" scrolling="no"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(e){if(void 0!==e.data["datawrapper-height"]){var t=document.querySelectorAll("iframe");for(var a in e.data["datawrapper-height"])for(var r=0;r<t.length;r++){if(t[r].contentWindow===e.source)t[r].style.height=e.data["datawrapper-height"][a]+"px"}}}))}();</script></div><p>You can read more about these paradigms on the <a href="https://www.wolfram.com/language/introduction-machine-learning/machine-learning-paradigms/#:~:text=Machine%20learning%20is%20commonly%20separated,is%20presented%20to%20the%20computer.">Wolfram blog</a>, but here&#8217;s a quick overview:</p><h3><strong>Supervised Learning</strong></h3><p>Supervised learning is characterised by labelled input/output pairs; the main goal is to build a model that generalises well to unseen data. A good approach to testing is splitting the data into a training set, a validation set, and a test set. The training set is used to train the model, the validation set is used for hyperparameter tuning and model selection, and the test set is reserved for the final offline evaluation. Common splits include 70:15:15 (train:validation:test) or 70:30 (train:test). 
Cross-validation is a common technique that minimises the loss of training data for the validation set while still ensuring a rich test set by iteratively training and validating different subsets (folds).</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!wUV4!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!wUV4!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png 424w, https://substackcdn.com/image/fetch/$s_!wUV4!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png 848w, https://substackcdn.com/image/fetch/$s_!wUV4!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png 1272w, https://substackcdn.com/image/fetch/$s_!wUV4!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!wUV4!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png" width="1575" height="1053" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:1053,&quot;width&quot;:1575,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:107624,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://safeintelai.substack.com/i/165627475?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fdb82f5ba-27e1-4a97-828f-2c16156263dd_1575x1182.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!wUV4!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png 424w, https://substackcdn.com/image/fetch/$s_!wUV4!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png 848w, https://substackcdn.com/image/fetch/$s_!wUV4!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png 1272w, https://substackcdn.com/image/fetch/$s_!wUV4!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F66f8c5b2-1230-4188-ae29-49dfadf60e06_1575x1053.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" 
height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Data splitting for supervised learning overview</figcaption></figure></div><p></p><h3><strong>Unsupervised Learning</strong></h3><p>Unsupervised learning aims to discover hidden patterns or groupings without labelled outputs, so there are no explicit splits of the dataset like we see in supervised learning. Instead, it makes use of intrinsic evaluation for clustering (like the silhouette coefficient, Davies-Bouldin index, inertia (the within-cluster sum of squared distances), and the Calinski-Harabasz index) and dimensionality reduction (like reconstruction error and explained variance ratio). 
Additionally, it incorporates extrinsic evaluation methods, such as visualisations and domain expert evaluations, which are critical due to the lack of ground truth.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!6oSF!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!6oSF!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png 424w, https://substackcdn.com/image/fetch/$s_!6oSF!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png 848w, https://substackcdn.com/image/fetch/$s_!6oSF!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png 1272w, https://substackcdn.com/image/fetch/$s_!6oSF!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!6oSF!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png" width="1456" height="597" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:597,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:56143,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://safeintelai.substack.com/i/165627475?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!6oSF!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png 424w, https://substackcdn.com/image/fetch/$s_!6oSF!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png 848w, https://substackcdn.com/image/fetch/$s_!6oSF!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png 1272w, https://substackcdn.com/image/fetch/$s_!6oSF!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F24ba8b7e-4a05-46b7-a91e-0d3549e58391_1488x610.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" 
height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Unsupervised learning overview</figcaption></figure></div><h3><strong>Reinforcement Learning</strong></h3><p>Lastly, in reinforcement learning, an agent interacts with an environment and learns a policy (a decision-making strategy) through repeated trials, aiming to maximise cumulative rewards and minimise penalties (negative rewards). 
Testing involves evaluating the learnt or trained policy in another controlled environment or set of episodes.</p><div class="captioned-image-container"><figure><a class="image-link image2 is-viewable-img" target="_blank" href="https://substackcdn.com/image/fetch/$s_!xb9b!,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png" data-component-name="Image2ToDOM"><div class="image2-inset"><picture><source type="image/webp" srcset="https://substackcdn.com/image/fetch/$s_!xb9b!,w_424,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png 424w, https://substackcdn.com/image/fetch/$s_!xb9b!,w_848,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png 848w, https://substackcdn.com/image/fetch/$s_!xb9b!,w_1272,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png 1272w, https://substackcdn.com/image/fetch/$s_!xb9b!,w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png 1456w" sizes="100vw"><img src="https://substackcdn.com/image/fetch/$s_!xb9b!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png" width="1456" height="597" 
data-attrs="{&quot;src&quot;:&quot;https://substack-post-media.s3.amazonaws.com/public/images/5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png&quot;,&quot;srcNoWatermark&quot;:null,&quot;fullscreen&quot;:null,&quot;imageSize&quot;:null,&quot;height&quot;:597,&quot;width&quot;:1456,&quot;resizeWidth&quot;:null,&quot;bytes&quot;:39821,&quot;alt&quot;:null,&quot;title&quot;:null,&quot;type&quot;:&quot;image/png&quot;,&quot;href&quot;:null,&quot;belowTheFold&quot;:true,&quot;topImage&quot;:false,&quot;internalRedirect&quot;:&quot;https://safeintelai.substack.com/i/165627475?img=https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png&quot;,&quot;isProcessing&quot;:false,&quot;align&quot;:null,&quot;offset&quot;:false}" class="sizing-normal" alt="" srcset="https://substackcdn.com/image/fetch/$s_!xb9b!,w_424,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png 424w, https://substackcdn.com/image/fetch/$s_!xb9b!,w_848,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png 848w, https://substackcdn.com/image/fetch/$s_!xb9b!,w_1272,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png 1272w, https://substackcdn.com/image/fetch/$s_!xb9b!,w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2F5d1dd92c-92dc-4848-88f0-d4a0b9a15cd6_1488x610.png 1456w" sizes="100vw" loading="lazy"></picture><div class="image-link-expand"><div class="pencraft pc-display-flex pc-gap-8 pc-reset"><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container restack-image"><svg role="img" width="20" 
height="20" viewBox="0 0 20 20" fill="none" stroke-width="1.5" stroke="var(--color-fg-primary)" stroke-linecap="round" stroke-linejoin="round" xmlns="http://www.w3.org/2000/svg"><g><title></title><path d="M2.53001 7.81595C3.49179 4.73911 6.43281 2.5 9.91173 2.5C13.1684 2.5 15.9537 4.46214 17.0852 7.23684L17.6179 8.67647M17.6179 8.67647L18.5002 4.26471M17.6179 8.67647L13.6473 6.91176M17.4995 12.1841C16.5378 15.2609 13.5967 17.5 10.1178 17.5C6.86118 17.5 4.07589 15.5379 2.94432 12.7632L2.41165 11.3235M2.41165 11.3235L1.5293 15.7353M2.41165 11.3235L6.38224 13.0882"></path></g></svg></button><button tabindex="0" type="button" class="pencraft pc-reset pencraft icon-container view-image"><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="lucide lucide-maximize2 lucide-maximize-2"><polyline points="15 3 21 3 21 9"></polyline><polyline points="9 21 3 21 3 15"></polyline><line x1="21" x2="14" y1="3" y2="10"></line><line x1="3" x2="10" y1="21" y2="14"></line></svg></button></div></div></div></a><figcaption class="image-caption">Reinforcement learning overview</figcaption></figure></div><h2><strong>Key Takeaways</strong></h2><p>Without systematic tests, we have no insight into how a model behaves. Yet the tests we <em>can</em> run look different from classic software checks: they rely on statistical sampling and extrapolation rather than a tiny set of fully representative inputs. We're only just getting started with this series, but already, we're getting a hint that building ML systems and testing whether they will function well in the real world is a big challenge. This article offers a few key takeaways:</p><ul><li><p>Ultimately, effective testing supports ML systems' safe, effective, and responsible deployment. 
This is crucial for avoiding costly failures and building user trust.</p></li><li><p>Different types of testing are essential, from data and model tests to infrastructure and end-to-end tests, ensuring comprehensive coverage of the ML system's components.</p></li><li><p>ML model testing involves a blend of quantitative and qualitative model assessment. This means going beyond accuracy scores to understand model behaviour, such as robustness and explainability in real-world scenarios.</p></li><li><p>We've seen how testing strategies vary for the respective ML paradigms (supervised, unsupervised, and reinforcement learning), each demanding specific objectives and evaluation techniques.</p></li><li><p>The need for rigorous and continuous testing is paramount. Failure in ML is expensive; real-world incidents prove that weak tests translate directly into significant financial loss, reputational damage, and even safety hazards.</p></li></ul><div class="pullquote"><p><strong>COMING UP&#8230; </strong>In the next article of this series, we'll explore how ML validation begins with test design.</p></div><p></p><div class="subscription-widget-wrap-editor" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/subscribe?&quot;,&quot;text&quot;:&quot;Subscribe&quot;,&quot;language&quot;:&quot;en&quot;}" data-component-name="SubscribeWidgetToDOM"><div class="subscription-widget show-subscribe"><div class="preamble"><p class="cta-caption">Thanks for reading! 
Subscribe for free to receive new posts.</p></div><form class="subscription-widget-subscribe"><input type="email" class="email-input" name="email" placeholder="Type your email&#8230;" tabindex="-1"><input type="submit" class="button primary" value="Subscribe"><div class="fake-input-wrapper"><div class="fake-input"></div><div class="fake-button"></div></div></form></div></div><h2><strong>Resources &amp; Further Reading</strong></h2><p>For further reading, see the links below, which are some of the best blog posts on testing for machine learning.</p><ul><li><p><a href="https://research.google.com/pubs/archive/aad9f93b86b7addfea4c419b9100c6cdd26cacea.pdf">Google: The ML Test Score: A Rubric for ML Production Readiness and Technical Debt Reduction</a></p></li><li><p><a href="https://medium.com/data-science/how-to-test-machine-learning-systems-d53623d32797">How to Test Machine Learning Systems</a></p></li><li><p><a href="https://madewithml.com/courses/mlops/testing/">Testing Machine Learning Systems: Code, Data and Models</a></p></li><li><p><a href="https://arxiv.org/abs/1412.6572">Explaining and Harnessing Adversarial Examples</a></p></li><li><p><a href="https://madewithml.com/courses/mlops/testing/">Made With ML by Anyscale: Testing Machine Learning Systems: Code, Data and Models</a></p></li><li><p><a href="https://neptune.ai/blog/automated-testing-machine-learning">Neptune AI: Automated Testing in Machine Learning Projects [Best Practices for MLOps]</a></p></li><li><p><a href="https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/">Train-Test Split for Evaluating Machine Learning Algorithms</a></p></li><li><p><a href="https://www.kdnuggets.com/2023/04/exploring-unsupervised-learning-metrics.html">KDNuggets: Exploring Unsupervised Learning Metrics</a></p></li><li><p><a href="https://www.youtube.com/watch?v=w33Lplx49_A">CS188 Artificial Intelligence, Fall 2013. 
Lecture 10: Reinforcement Learning</a></p></li></ul><div><hr></div><p></p><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/ml-testing-refresher-ae9/comments&quot;,&quot;text&quot;:&quot;Leave a comment&quot;,&quot;action&quot;:null,&quot;class&quot;:null}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/ml-testing-refresher-ae9/comments"><span>Leave a comment</span></a></p><div class="captioned-button-wrap" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/ml-testing-refresher-ae9?utm_source=substack&utm_medium=email&utm_content=share&action=share&quot;,&quot;text&quot;:&quot;Share&quot;}" data-component-name="CaptionedButtonToDOM"><div class="preamble"><p class="cta-caption">Feel free to share this piece with anyone who might find it useful.</p></div><p class="button-wrapper" data-attrs="{&quot;url&quot;:&quot;https://resilient.safeintelligence.ai/p/ml-testing-refresher-ae9?utm_source=substack&utm_medium=email&utm_content=share&action=share&quot;,&quot;text&quot;:&quot;Share&quot;}" data-component-name="ButtonCreateButton"><a class="button primary" href="https://resilient.safeintelligence.ai/p/ml-testing-refresher-ae9?utm_source=substack&utm_medium=email&utm_content=share&action=share"><span>Share</span></a></p></div><p></p>]]></content:encoded></item></channel></rss>