🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

AI Testing

🟢 Free Lesson

Advertisement

AI Testing

Test Cases: Standard Inputs, Edge Cases | Functional: Correctness Tests | Adversarial: Attack Detection | Regression: Performance Baseline | AI Model (LLM / Classifier) Under Test | Version: v2.1 | Status: Testing | Accuracy: 94% | Adversarial: 82% | Latency: 120ms | Test Report: Pass/Fail Summary, Recommendations

AI testing encompasses functional testing, adversarial robustness evaluation, regression testing, and comprehensive model quality assessment.

Adversarial Testing

import numpy as np

class AdversarialTester:
    """Probe a text classifier with single-word synonym substitutions.

    Each attack replaces one word of the input with a WordNet synonym and
    records the substitution whenever the predicted label changes.
    """

    def __init__(self, model, tokenizer):
        """Store the classifier and its tokenizer.

        Args:
            model: Callable invoked as ``model(**inputs)``; its result must
                expose ``logits`` supporting ``argmax(-1).item()``.
            tokenizer: Callable invoked as
                ``tokenizer(text, return_tensors="pt", truncation=True)``;
                its result is unpacked into the model call.
        """
        self.model = model
        self.tokenizer = tokenizer

    def text_fooler_attack(self, text: str, original_label: int,
                           num_perturbations: int = 10) -> list:
        """Try one synonym substitution per word and collect label flips.

        Only the first ``num_perturbations`` whitespace-separated words are
        perturbed, one at a time; every other word is left untouched.

        Args:
            text: Input text to attack.
            original_label: Label the unmodified text is expected to get.
            num_perturbations: Maximum number of leading words to perturb.

        Returns:
            One dict per substitution that changed the prediction, with the
            keys ``original``, ``modified``, ``changed_word`` (word index),
            ``original_label`` and ``new_label``.
        """
        words = text.split()
        successful_attacks = []

        # max(..., 0) keeps a negative limit equivalent to "no perturbations".
        for index, word in enumerate(words[:max(num_perturbations, 0)]):
            modified_words = words.copy()
            modified_words[index] = self._get_synonym(word)
            modified_text = " ".join(modified_words)

            new_label = self._predict(modified_text)
            if new_label != original_label:
                successful_attacks.append({
                    "original": text,
                    "modified": modified_text,
                    "changed_word": index,
                    "original_label": original_label,
                    "new_label": new_label
                })

        return successful_attacks

    def _get_synonym(self, word: str) -> str:
        """Return the first WordNet lemma that differs from ``word``.

        Falls back to ``word + "_modified"`` when WordNet offers no
        alternative, so the returned token always differs from the input.
        """
        # Imported lazily so the class can be used (and imported) without
        # nltk until an attack actually needs a synonym.
        from nltk.corpus import wordnet

        # The WordNet lookup lowercases its query, so compare without case:
        # otherwise "Good" would get its own lemma "good" back as a synonym.
        target = word.lower()
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                candidate = lemma.name()
                if candidate.lower() != target:
                    return candidate
        return word + "_modified"

    def _predict(self, text: str) -> int:
        """Return the index of the highest logit the model gives ``text``."""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        outputs = self.model(**inputs)
        return outputs.logits.argmax(-1).item()

    def evaluate_robustness(self, test_cases: list) -> dict:
        """Attack every ``(text, label)`` pair and summarise the outcome.

        Args:
            test_cases: Iterable of ``(text, label)`` pairs.

        Returns:
            Dict with ``total_attacks`` (number of successful substitutions),
            ``success_rate`` (fraction of test cases fooled by at least one
            substitution, always within ``[0, 1]``; ``0.0`` when there are no
            test cases) and ``successful_attacks`` (all attack records).
        """
        attacks = []
        total_cases = 0
        fooled_cases = 0
        for text, label in test_cases:
            total_cases += 1
            case_attacks = self.text_fooler_attack(text, label)
            if case_attacks:
                fooled_cases += 1
                attacks.extend(case_attacks)

        # Count fooled cases rather than individual attacks: one text can
        # yield several successful substitutions, which would push a
        # per-attack ratio above 1.0.
        success_rate = fooled_cases / total_cases if total_cases else 0.0
        return {
            "total_attacks": len(attacks),
            "success_rate": success_rate,
            "successful_attacks": attacks
        }

# Example: run the synonym-substitution attack over a dataset of
# (text, label) pairs. `model`, `tokenizer` and `test_dataset` must already
# be defined by the caller.
tester = AdversarialTester(model=model, tokenizer=tokenizer)
results = tester.evaluate_robustness(test_cases=test_dataset)

Model Quality Tests

class ModelQualityTests:
    """Consistency, latency and fairness checks for a text classifier."""

    def __init__(self, model, tokenizer):
        """Store the classifier and its tokenizer.

        Args:
            model: Callable invoked as ``model(**inputs)``; its result must
                expose ``logits`` supporting ``argmax(-1).item()``.
            tokenizer: Callable invoked as
                ``tokenizer(text, return_tensors="pt", truncation=True)``;
                its result is unpacked into the model call.
        """
        self.model = model
        self.tokenizer = tokenizer
        # No method of this class writes to this list; it is kept so code
        # that reads the attribute keeps working.
        self.results = []

    def _encode(self, text: str):
        """Tokenise ``text`` with the same options as AdversarialTester._predict."""
        return self.tokenizer(text, return_tensors="pt", truncation=True)

    def _predict(self, text: str) -> int:
        """Return the index of the highest logit the model gives ``text``."""
        outputs = self.model(**self._encode(text))
        return outputs.logits.argmax(-1).item()

    def test_consistency(self, text: str, num_runs: int = 5) -> dict:
        """Predict the same text repeatedly and check that the label is stable.

        Returns:
            Dict with ``consistent`` (True when every run produced the same
            label) and ``predictions`` (the label from each run).
        """
        predictions = [self._predict(text) for _ in range(num_runs)]
        return {
            "consistent": len(set(predictions)) == 1,
            "predictions": predictions,
        }

    def test_latency(self, texts: list, max_latency_ms: float = 100) -> dict:
        """Time tokenisation plus one model call for each text.

        Args:
            texts: Inputs to time, one model call each.
            max_latency_ms: Upper bound the 95th percentile must not exceed.

        Returns:
            Dict with ``avg_latency_ms`` and ``p95_latency_ms`` as plain
            floats and ``meets_sla`` as a plain bool. With no texts both
            latencies are NaN and ``meets_sla`` is False, because nothing
            was measured.
        """
        import time

        latencies = []
        for text in texts:
            # perf_counter is the clock meant for measuring short durations;
            # time.time() follows the wall clock and can jump.
            start = time.perf_counter()
            self.model(**self._encode(text))
            latencies.append((time.perf_counter() - start) * 1000)

        if not latencies:
            nan = float("nan")
            return {
                "avg_latency_ms": nan,
                "p95_latency_ms": nan,
                "meets_sla": False,
            }

        # Compute the percentile once; convert NumPy scalars to builtins so
        # the report can be serialised and compared with `is True`.
        p95 = float(np.percentile(latencies, 95))
        return {
            "avg_latency_ms": float(np.mean(latencies)),
            "p95_latency_ms": p95,
            "meets_sla": bool(p95 <= max_latency_ms),
        }

    def test_fairness(self, texts_by_group: dict) -> dict:
        """Compare accuracy across groups of labelled texts.

        Args:
            texts_by_group: Maps a group name to a sequence of
                ``(text, expected_label)`` pairs.

        Returns:
            Dict with ``group_accuracies`` (accuracy per group, ``None`` for
            a group without examples) and ``disparity`` (best minus worst
            accuracy over the groups that have examples; ``0.0`` when no
            group has any).
        """
        group_accuracies = {}
        for group, texts in texts_by_group.items():
            if len(texts) == 0:
                # Accuracy is undefined without examples; record the gap
                # instead of dividing by zero.
                group_accuracies[group] = None
                continue
            correct = sum(
                self._predict(text) == expected for text, expected in texts
            )
            group_accuracies[group] = correct / len(texts)

        scored = [acc for acc in group_accuracies.values() if acc is not None]
        disparity = max(scored) - min(scored) if scored else 0.0
        return {
            "group_accuracies": group_accuracies,
            "disparity": disparity,
        }

    def run_all_tests(self, test_suite: dict) -> dict:
        """Run every check using the inputs found in ``test_suite``.

        Args:
            test_suite: Must provide ``sample_text`` (str),
                ``latency_texts`` (list of str) and ``fairness_texts``
                (mapping accepted by ``test_fairness``).

        Returns:
            Dict with the ``consistency``, ``latency`` and ``fairness``
            reports.
        """
        return {
            "consistency": self.test_consistency(test_suite["sample_text"]),
            "latency": self.test_latency(test_suite["latency_texts"]),
            "fairness": self.test_fairness(test_suite["fairness_texts"]),
        }

# Example: run the consistency, latency and fairness checks in one call.
# `model`, `tokenizer` and `test_suite` must already be defined by the
# caller; `test_suite` needs the keys "sample_text", "latency_texts" and
# "fairness_texts".
tester = ModelQualityTests(model=model, tokenizer=tokenizer)
results = tester.run_all_tests(test_suite=test_suite)

Regression Testing

class RegressionTester:
    """Compare current metric values against a stored baseline."""

    def __init__(self, baseline_results: dict):
        """Store the baseline metrics.

        Args:
            baseline_results: Maps metric name to its baseline value.
        """
        # Copy so later mutation of the caller's dict cannot silently move
        # the baseline.
        self.baseline = dict(baseline_results)
        # Set by save_baseline; None until a versioned baseline is saved.
        self.baseline_version = None
        self.current_results = {}

    def compare_results(self, current: dict, threshold: float = 0.02,
                        lower_is_better=()) -> dict:
        """Classify each metric as regressed, improved or unchanged.

        The change of a metric is its difference from the baseline divided
        by the magnitude of the baseline. When the baseline is exactly zero
        a relative change is undefined, so the absolute difference is used
        instead. Metrics that have no baseline entry are skipped.

        Args:
            current: Maps metric name to its current value.
            threshold: Minimum magnitude of change that counts as a
                regression or an improvement.
            lower_is_better: Names of metrics for which a decrease is an
                improvement (for example a latency). All other metrics are
                treated as higher-is-better.

        Returns:
            Dict with ``regressions`` and ``improvements`` (lists of
            ``{"metric", "change"}`` where ``change`` is the signed change
            of the raw value) and ``passed`` (True when nothing regressed).
        """
        self.current_results = current
        if isinstance(lower_is_better, str):
            # A bare string would otherwise be split into characters.
            lower_is_better = (lower_is_better,)
        inverted = set(lower_is_better)

        regressions = []
        improvements = []

        for metric, value in current.items():
            if metric not in self.baseline:
                # Nothing to compare against; neither better nor worse.
                continue

            baseline_val = self.baseline[metric]
            diff = value - baseline_val
            # Divide by the magnitude: dividing by a negative baseline would
            # flip the sign and report a drop as an improvement.
            change = diff / abs(baseline_val) if baseline_val != 0 else diff

            # Orient the change so that positive always means "better".
            directed = -change if metric in inverted else change
            if directed < -threshold:
                regressions.append({"metric": metric, "change": change})
            elif directed > threshold:
                improvements.append({"metric": metric, "change": change})

        return {
            "regressions": regressions,
            "improvements": improvements,
            "passed": len(regressions) == 0
        }

    def save_baseline(self, results: dict, version: str):
        """Replace the baseline and remember which version produced it.

        Args:
            results: Maps metric name to its new baseline value.
            version: Identifier of the model version; stored in
                ``baseline_version``.
        """
        self.baseline = dict(results)
        self.baseline_version = version

# Example: compare a new evaluation run against the stored baseline metrics.
regression = RegressionTester(
    baseline_results={"accuracy": 0.94, "f1": 0.92},
)
comparison = regression.compare_results(
    current={"accuracy": 0.935, "f1": 0.925},
)

Key Takeaways

  • Adversarial testing evaluates model robustness against attacks
  • Quality tests check consistency, latency, and fairness
  • Regression testing catches performance degradation
  • Automated testing enables continuous quality monitoring
  • Test coverage should include edge cases and diverse inputs
⭐

Premium Content

AI Testing

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement