# AI Testing
AI testing encompasses functional testing, adversarial robustness evaluation, regression testing, and comprehensive model quality assessment.
## Adversarial Testing
import numpy as np
class AdversarialTester:
    """Probe a text classifier with single-word synonym substitutions.

    Args:
        model: Callable invoked as ``model(**inputs)``; the returned object
            must expose ``.logits`` supporting ``argmax(-1).item()``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=True)`` whose
            result can be unpacked into ``model``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def text_fooler_attack(self, text: str, original_label: int,
                           num_perturbations: int = 10) -> list:
        """Replace one word at a time and collect label-flipping variants.

        Only the first ``num_perturbations`` whitespace-separated words are
        perturbed, each in isolation (all other words stay untouched).

        Args:
            text: Input sentence to attack.
            original_label: Label the unmodified text is expected to get.
            num_perturbations: Upper bound on the number of word positions
                that are tried.

        Returns:
            One dict per successful attack with the keys ``original``,
            ``modified``, ``changed_word`` (index of the replaced word),
            ``original_label`` and ``new_label``.
        """
        words = text.split()
        successful_attacks = []
        limit = min(num_perturbations, len(words))
        for index in range(limit):
            candidate = words.copy()
            candidate[index] = self._get_synonym(words[index])
            modified_text = " ".join(candidate)
            new_label = self._predict(modified_text)
            if new_label != original_label:
                successful_attacks.append({
                    "original": text,
                    "modified": modified_text,
                    "changed_word": index,
                    "original_label": original_label,
                    "new_label": new_label,
                })
        return successful_attacks

    def _get_synonym(self, word: str) -> str:
        """Return the first WordNet lemma name that differs from ``word``.

        Falls back to ``word + "_modified"`` when WordNet offers no
        alternative, so the returned string always differs from the input.
        """
        # Kept function-local: NLTK is only needed once an attack is run.
        from nltk.corpus import wordnet

        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                name = lemma.name()
                if name != word:
                    # First match wins; no need to collect the rest.
                    return name
        return word + "_modified"

    def _predict(self, text: str) -> int:
        """Return the arg-max class index the model assigns to ``text``."""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        outputs = self.model(**inputs)
        return outputs.logits.argmax(-1).item()

    def evaluate_robustness(self, test_cases: list) -> dict:
        """Attack every ``(text, label)`` pair and summarise the outcome.

        Args:
            test_cases: Iterable of ``(text, label)`` pairs.

        Returns:
            Dict with ``total_attacks`` (number of successful perturbations
            over all inputs), ``success_rate`` (fraction of inputs for which
            at least one perturbation flipped the label, always in
            ``[0, 1]``; ``0.0`` when there are no inputs) and
            ``successful_attacks`` (the flat list of attack records).
        """
        attacks = []
        total_inputs = 0
        fooled_inputs = 0
        for text, label in test_cases:
            total_inputs += 1
            found = self.text_fooler_attack(text, label)
            if found:
                # Several perturbations may succeed on one input; the rate
                # counts the input once so it cannot exceed 1.0.
                fooled_inputs += 1
            attacks.extend(found)
        success_rate = fooled_inputs / total_inputs if total_inputs else 0.0
        return {
            "total_attacks": len(attacks),
            "success_rate": success_rate,
            "successful_attacks": attacks,
        }
# Usage
# NOTE: `model`, `tokenizer` and `test_dataset` are not defined in this
# snippet and must exist before these lines run. `test_dataset` is iterated
# as (text, label) pairs by evaluate_robustness.
tester = AdversarialTester(model, tokenizer)
results = tester.evaluate_robustness(test_dataset)
## Model Quality Tests
class ModelQualityTests:
    """Consistency, latency and fairness checks for a text classifier.

    Args:
        model: Callable invoked as ``model(**inputs)``; the returned object
            must expose ``.logits`` supporting ``argmax(-1).item()``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=True)`` whose
            result can be unpacked into ``model``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # Not populated by any method of this class; kept for callers that
        # may read or fill it themselves.
        self.results = []

    def _predict(self, text: str) -> int:
        """Return the arg-max class index the model assigns to ``text``."""
        # truncation=True keeps over-long inputs within the tokenizer's limit
        # instead of passing them to the model unbounded.
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        return self.model(**inputs).logits.argmax(-1).item()

    def test_consistency(self, text: str, num_runs: int = 5) -> dict:
        """Predict the same text ``num_runs`` times and compare the labels.

        Returns:
            Dict with ``consistent`` (True only when exactly one distinct
            label was produced) and ``predictions`` (labels in run order).
        """
        predictions = [self._predict(text) for _ in range(num_runs)]
        return {
            "consistent": len(set(predictions)) == 1,
            "predictions": predictions,
        }

    def test_latency(self, texts: list, max_latency_ms: float = 100) -> dict:
        """Time tokenization plus one forward pass for every text.

        Args:
            texts: Inputs to time, one model call each.
            max_latency_ms: Latency budget the 95th percentile must meet.

        Returns:
            Dict with ``avg_latency_ms`` and ``p95_latency_ms`` as plain
            floats and ``meets_sla`` as a plain bool. With no inputs both
            latencies are NaN and ``meets_sla`` is False, because nothing was
            measured.
        """
        import time

        latencies = []
        for text in texts:
            # perf_counter is monotonic and high-resolution, unlike the
            # wall clock, which can jump while a measurement is in flight.
            start = time.perf_counter()
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
            self.model(**inputs)
            latencies.append((time.perf_counter() - start) * 1000)

        if not latencies:
            nan = float("nan")
            return {
                "avg_latency_ms": nan,
                "p95_latency_ms": nan,
                "meets_sla": False,
            }

        # Convert NumPy scalars to built-ins so the report is JSON-friendly.
        p95 = float(np.percentile(latencies, 95))
        return {
            "avg_latency_ms": float(np.mean(latencies)),
            "p95_latency_ms": p95,
            "meets_sla": p95 <= max_latency_ms,
        }

    def test_fairness(self, texts_by_group: dict) -> dict:
        """Compute per-group accuracy and the gap between best and worst.

        Args:
            texts_by_group: Maps a group name to a sized collection of
                ``(text, expected_label)`` pairs.

        Returns:
            Dict with ``group_accuracies`` (group -> accuracy, or ``None``
            for a group without samples) and ``disparity`` (max minus min
            accuracy over the groups that have samples; ``0.0`` when no group
            has any).
        """
        group_accuracies = {}
        for group, texts in texts_by_group.items():
            if not texts:
                # No samples means no measurable accuracy; record that
                # explicitly rather than dividing by zero.
                group_accuracies[group] = None
                continue
            correct = sum(
                1 for text, expected in texts
                if self._predict(text) == expected
            )
            group_accuracies[group] = correct / len(texts)

        measured = [acc for acc in group_accuracies.values() if acc is not None]
        disparity = max(measured) - min(measured) if measured else 0.0
        return {
            "group_accuracies": group_accuracies,
            "disparity": disparity,
        }

    def run_all_tests(self, test_suite: dict) -> dict:
        """Run all three checks.

        Args:
            test_suite: Must provide the keys ``sample_text``,
                ``latency_texts`` and ``fairness_texts``.

        Returns:
            Dict with the keys ``consistency``, ``latency`` and ``fairness``
            holding the respective check results.
        """
        return {
            "consistency": self.test_consistency(test_suite["sample_text"]),
            "latency": self.test_latency(test_suite["latency_texts"]),
            "fairness": self.test_fairness(test_suite["fairness_texts"]),
        }
# Usage
# NOTE: `model`, `tokenizer` and `test_suite` are not defined in this snippet
# and must exist before these lines run. run_all_tests reads the keys
# "sample_text", "latency_texts" and "fairness_texts" from `test_suite`.
# If executed in the same module as the adversarial example, these lines
# rebind the names `tester` and `results`.
tester = ModelQualityTests(model, tokenizer)
results = tester.run_all_tests(test_suite)
## Regression Testing
class RegressionTester:
    """Compare current metric values against a stored baseline.

    Args:
        baseline_results: Maps metric name to its numeric baseline value.
            A shallow copy is kept, so later changes to the caller's dict do
            not alter the baseline.
    """

    def __init__(self, baseline_results: dict):
        self.baseline = dict(baseline_results)
        # Label passed to the latest save_baseline call; None until then.
        self.baseline_version = None
        self.current_results = {}

    def compare_results(self, current: dict, threshold: float = 0.02, *,
                        lower_is_better=None) -> dict:
        """Classify each metric as regressed, improved or unchanged.

        The change is measured relative to the magnitude of the baseline
        value, so its sign always follows the direction of the movement, even
        for negative baselines. When the baseline value is exactly zero a
        relative change is undefined and the absolute difference is used
        instead. Metrics absent from the baseline are skipped because there
        is nothing to compare them with.

        Args:
            current: Maps metric name to its current numeric value.
            threshold: Minimum magnitude of change that is reported.
            lower_is_better: Optional iterable of metric names (for example
                latency or loss) for which a decrease is an improvement. Pass
                a list or set, not a bare string.

        Returns:
            Dict with ``regressions`` and ``improvements`` (lists of
            ``{"metric", "change"}`` records, where ``change`` is the signed
            raw change) and ``passed`` (True when nothing regressed).
        """
        self.current_results = current
        inverted = frozenset(lower_is_better or ())
        regressions = []
        improvements = []
        for metric, value in current.items():
            if metric not in self.baseline:
                continue
            baseline_val = self.baseline[metric]
            diff = value - baseline_val
            # abs() keeps the sign of the change tied to the direction of
            # movement rather than to the sign of the baseline.
            pct_change = diff / abs(baseline_val) if baseline_val != 0 else diff
            # Flip only the classification; the reported change stays raw.
            directed = -pct_change if metric in inverted else pct_change
            record = {"metric": metric, "change": pct_change}
            if directed < -threshold:
                regressions.append(record)
            elif directed > threshold:
                improvements.append(record)
        return {
            "regressions": regressions,
            "improvements": improvements,
            "passed": not regressions,
        }

    def save_baseline(self, results: dict, version: str):
        """Replace the baseline with a copy of ``results``.

        Args:
            results: New baseline metrics.
            version: Label for this baseline; stored on
                ``self.baseline_version``.
        """
        self.baseline = dict(results)
        self.baseline_version = version
# Usage
# Accuracy moves from 0.94 to 0.935 (about -0.5%) and f1 from 0.92 to 0.925
# (about +0.5%). Both are inside the default 2% threshold, so neither is
# reported and `comparison["passed"]` is True.
regression = RegressionTester(baseline_results={"accuracy": 0.94, "f1": 0.92})
comparison = regression.compare_results({"accuracy": 0.935, "f1": 0.925})
## Key Takeaways
- Adversarial testing evaluates model robustness against attacks
- Quality tests check consistency, latency, and fairness
- Regression testing catches performance degradation
- Automated testing enables continuous quality monitoring
- Test coverage should include edge cases and diverse inputs