@inproceedings{lum-etal-2025-bias,
title = "Bias in Language Models: Beyond Trick Tests and Towards {RUTE}d Evaluation",
author = "Lum, Kristian and
Anthis, Jacy Reese and
Robinson, Kevin and
Nagpal, Chirag and
D{'}Amour, Alexander Nicholas",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.7/",
pages = "137--161",
ISBN = "979-8-89176-251-0",
abstract = "Standard bias benchmarks used for large language models (LLMs) measure the association between social attributes in model inputs and single-word model outputs. We test whether these benchmarks are robust to lengthening the model outputs via a more realistic user prompt, in the commonly studied domain of gender-occupation bias, as a step towards measuring Realistic Use and Tangible Effects (i.e., RUTEd evaluations). From the current literature, we adapt three standard metrics of next-word prediction (neutrality, skew, and stereotype), and we develop analogous RUTEd evaluations in three contexts of real-world LLM use: children{'}s bedtime stories, user personas, and English language learning exercises. We find that standard bias metrics have no significant correlation with long-form output metrics. For example, selecting the least biased model based on the standard ``trick tests'' coincides with selecting the least biased model based on longer output no more than random chance. There may not yet be evidence to justify standard benchmarks as reliable proxies of real-world biases, and we encourage further development of context-specific RUTEd evaluations."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lum-etal-2025-bias">
<titleInfo>
<title>Bias in Language Models: Beyond Trick Tests and Towards RUTEd Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kristian</namePart>
<namePart type="family">Lum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacy</namePart>
<namePart type="given">Reese</namePart>
<namePart type="family">Anthis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Robinson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chirag</namePart>
<namePart type="family">Nagpal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="given">Nicholas</namePart>
<namePart type="family">D’Amour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Standard bias benchmarks used for large language models (LLMs) measure the association between social attributes in model inputs and single-word model outputs. We test whether these benchmarks are robust to lengthening the model outputs via a more realistic user prompt, in the commonly studied domain of gender-occupation bias, as a step towards measuring Realistic Use and Tangible Effects (i.e., RUTEd evaluations). From the current literature, we adapt three standard metrics of next-word prediction (neutrality, skew, and stereotype), and we develop analogous RUTEd evaluations in three contexts of real-world LLM use: children’s bedtime stories, user personas, and English language learning exercises. We find that standard bias metrics have no significant correlation with long-form output metrics. For example, selecting the least biased model based on the standard “trick tests” coincides with selecting the least biased model based on longer output no more than random chance. There may not yet be evidence to justify standard benchmarks as reliable proxies of real-world biases, and we encourage further development of context-specific RUTEd evaluations.</abstract>
<identifier type="citekey">lum-etal-2025-bias</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.7/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>137</start>
<end>161</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bias in Language Models: Beyond Trick Tests and Towards RUTEd Evaluation
%A Lum, Kristian
%A Anthis, Jacy Reese
%A Robinson, Kevin
%A Nagpal, Chirag
%A D’Amour, Alexander Nicholas
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F lum-etal-2025-bias
%X Standard bias benchmarks used for large language models (LLMs) measure the association between social attributes in model inputs and single-word model outputs. We test whether these benchmarks are robust to lengthening the model outputs via a more realistic user prompt, in the commonly studied domain of gender-occupation bias, as a step towards measuring Realistic Use and Tangible Effects (i.e., RUTEd evaluations). From the current literature, we adapt three standard metrics of next-word prediction (neutrality, skew, and stereotype), and we develop analogous RUTEd evaluations in three contexts of real-world LLM use: children’s bedtime stories, user personas, and English language learning exercises. We find that standard bias metrics have no significant correlation with long-form output metrics. For example, selecting the least biased model based on the standard “trick tests” coincides with selecting the least biased model based on longer output no more than random chance. There may not yet be evidence to justify standard benchmarks as reliable proxies of real-world biases, and we encourage further development of context-specific RUTEd evaluations.
%U https://aclanthology.org/2025.acl-long.7/
%P 137-161
Markdown (Informal)
[Bias in Language Models: Beyond Trick Tests and Towards RUTEd Evaluation](https://aclanthology.org/2025.acl-long.7/) (Lum et al., ACL 2025)
ACL
- Kristian Lum, Jacy Reese Anthis, Kevin Robinson, Chirag Nagpal, and Alexander Nicholas D’Amour. 2025. Bias in Language Models: Beyond Trick Tests and Towards RUTEd Evaluation. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 137–161, Vienna, Austria. Association for Computational Linguistics.