{
  "author": {
    "name": "David Mimno",
    "affiliation": "Cornell University, Department of Information Science",
    "url": "https://mimno.infosci.cornell.edu",
    "email": "mimno@cornell.edu",
    "research_areas": [
      "natural language processing",
      "topic modeling",
      "large language models",
      "computational humanities",
      "text mining",
      "machine learning"
    ]
  },
  "generated": "2025-06-18",
  "publications": [
{
  "title": "Elias in the Lighthouse, Again? Diagnosing Low Diversity in 
LLM Stories",
  "authors": ["Sil Hamilton", "David Mimno"],
  "year": 2026,
  "venue": "Preprint",
  "abstract": "LLM-generated stories are popular but show very low 
variability. Sampling 20,000 stories from four current models using five 
prompts, the authors find that 11 words occur in 88.3% of generated 
stories, with little difference between models. These words include names 
(Elias, Mara, Elara), settings (lighthouses), and professions (clockmaker, 
librarian). The pattern is traced to shared upstream training data and 
alignment fine-tuning pipelines, with implications for the perceived 
diversity of LLM creative output.",
  "arxiv": "https://arxiv.org/abs/2605.26492",
  "pdf": "https://arxiv.org/pdf/2605.26492",
  "tags": ["LLMs", "text generation", "diversity", "creative writing", 
"evaluation"]
},
{
  "title": "Are LLMs Overkill for Databases? A Study on the Finiteness of 
SQL",
  "authors": ["Yue Li", "David Mimno", "Unso Eun Seo Jo"],
  "year": 2026,
  "venue": "Preprint",
  "abstract": "Translating natural language to SQL has become more 
accessible thanks to code generation LLMs, but how hard is it really? 
While databases can become unbounded in complexity, the complexity of 
queries is bounded by real-life utility and human needs. With a sample of 
376 databases, this paper shows that SQL queries are finite in practical 
complexity, following a Power Law-like distribution where 70% of queries 
can be covered with just 13% of all template types. This suggests that 
while LLMs can be useful for text-to-SQL, they may be operating in a 
narrow, highly formulaic space where templates could be safer, cheaper, 
and more auditable.",
  "arxiv": "https://arxiv.org/abs/2603.25568",
  "pdf": "https://arxiv.org/pdf/2603.25568",
  "tags": ["LLMs", "text-to-SQL", "databases", "evaluation", "NLP"]
},
{
  "title": "Too Long, Didn't Model: Decomposing LLM Long-Context 
Understanding With Novels",
  "authors": ["Sil Hamilton", "Rebecca M. M. Hicke", "Matthew Wilkens", 
"David Mimno"],
  "year": 2025,
  "venue": "Preprint",
  "abstract": "Although LLM context lengths have increased to millions of 
tokens, evaluating their effectiveness beyond needle-in-a-haystack 
approaches has proven difficult. Novels provide a case study of subtle, 
complicated structure and long-range semantic dependencies often exceeding 
128k tokens. This paper releases the Too Long, Didn't Model (TLDM) 
benchmark, which tests a model's ability to report plot summary, 
storyworld configuration, and elapsed narrative time across 40 novels. 
None of seven tested frontier LLMs retain stable understanding beyond 64k 
tokens, suggesting model developers must look beyond existing long-context 
benchmarks.",
  "arxiv": "https://arxiv.org/abs/2505.14925",
  "pdf": "https://arxiv.org/pdf/2505.14925",
  "tags": ["LLMs", "long-context", "evaluation", "computational 
humanities", "benchmark"]
},
{
  "title": "Agent Bain vs. Agent McKinsey: A New Text-to-SQL Benchmark for 
the Business Domain",
  "authors": ["Yue Li", "Ran Tao", "Derek Hommel", "Yusuf Denizay Dönder", 
"Sungyong Chang", "David Mimno", "Unso Eun Seo Jo"],
  "year": 2026,
  "venue": "Under review",
  "abstract": "Text-to-SQL benchmarks have traditionally only tested 
simple data access. CORGI introduces a new benchmark that reflects 
practical database queries encountered by end users, with four levels of 
business query complexity: descriptive, explanatory, predictive, and 
recommendational. The benchmark requires causal reasoning, temporal 
forecasting, and strategic recommendation. LLMs show an average 33% lower 
success execution rate on CORGI compared to existing benchmarks such as 
BIRD, revealing a large gap between current benchmark performance and 
real-world business query needs.",
  "arxiv": "https://arxiv.org/abs/2510.07309",
  "pdf": "https://arxiv.org/pdf/2510.07309",
  "tags": ["LLMs", "text-to-SQL", "benchmark", "databases", "evaluation"]
},
    {
      "title": "The Zero Body Problem: Probing LLM Use of Sensory Language",
      "authors": ["Rebecca M. M. Hicke", "Sil Hamilton", "David Mimno"],
      "year": 2025,
      "venue": "COLM 2025",
      "venue_full": "Conference on Language Modeling",
      "abstract": "Sensory language expresses embodied experiences ranging from taste and sound to excitement and stomachache. This language is of interest to scholars from a wide range of domains including robotics, narratology, linguistics, and cognitive science. This paper explores whether language models, which are not embodied, can approximate human use of embodied language. The authors extend an existing corpus of parallel human and model responses to short story prompts with an additional 18,000 stories generated by 18 popular models. All models generate stories that differ significantly from human usage of sensory language, but the direction of these differences varies considerably between model families.",
      "arxiv": "https://arxiv.org/abs/2504.06393",
      "pdf": "https://arxiv.org/pdf/2504.06393",
      "tags": ["LLMs", "language", "embodiment", "narrative", "evaluation"]
    },
    {
      "title": "Large Language Models in Qualitative Research: Uses, Tensions, and Intentions",
      "authors": ["Hope Schroeder", "Marianne Aubin Le Quéré", "Casey Randazzo", "David Mimno", "Sarita Schoenebeck"],
      "year": 2025,
      "venue": "CHI 2025",
      "venue_full": "Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems",
      "doi": "https://doi.org/10.1145/3706598.3713120",
      "abstract": "Qualitative researchers use tools to collect, sort, and analyze their data. Should qualitative researchers use large language models (LLMs) as part of their practice? LLMs could augment qualitative research, but it is unclear if their use is appropriate, ethical, or aligned with qualitative researchers' goals and values. Twenty qualitative researchers were interviewed to investigate these tensions. Many participants see LLMs as promising interlocutors with attractive use cases across the stages of research, but wrestle with their performance and appropriateness. Participants surface concerns regarding the use of LLMs while protecting participant interests, and call attention to an urgent lack of norms and tooling to guide the ethical use of LLMs in research.",
      "arxiv": "https://arxiv.org/abs/2410.07362",
      "pdf": "https://arxiv.org/pdf/2410.07362",
      "tags": ["LLMs", "qualitative research", "HCI", "research methods", "ethics"]
    },
    {
      "title": "Data Paradigms in the Era of LLMs: On the Opportunities and Challenges of Qualitative Data in the WILD",
      "authors": ["Shengqi Zhu", "Jeffrey M. Rzeszotarski", "David Mimno"],
      "year": 2025,
      "venue": "CHI EA 2025",
      "venue_full": "Proceedings of the Extended Abstracts of the CHI Conference on Human Factors in Computing Systems",
      "doi": "https://doi.org/10.1145/3706599.3720285",
      "abstract": "This work considers current and future affordances for a prospective data paradigm for Human-AI Interaction studies: real-World Interactions as Large-scale Data sources (WILD). The paper envisions how the records of natural interactions with LLMs — a common yet uncontrolled product of contemporary chatbots — may serve as a comprehensive, sustainable data source for qualitative user studies. Opportunities are embedded in detailed discussions of the background and current form of such data, alongside practical directions for improvement and fundamental challenges.",
      "tags": ["LLMs", "qualitative data", "HCI", "Human-AI interaction", "data collection"]
    },
    {
      "title": "Provocations from the Humanities for Generative AI Research",
      "authors": ["Lauren F. Klein", "Meredith Martin", "André Brock", "Maria Antoniak", "Melanie Walsh", "Jessica Marie Johnson", "Lauren Tilton", "David Mimno"],
      "year": 2025,
      "venue": "Preprint",
      "abstract": "The effects of generative AI are experienced by a broad range of constituencies, but the disciplinary inputs to its development have been surprisingly narrow. This paper presents a set of provocations from humanities researchers — currently underrepresented in AI development — intended to inform its future applications and enrich ongoing conversations about its uses, impact, and harms.",
      "arxiv": "https://arxiv.org/abs/2502.19190",
      "pdf": "https://arxiv.org/pdf/2502.19190",
      "tags": ["generative AI", "humanities", "AI ethics", "cultural AI"]
    },
    {
      "title": "Looking for the Inner Music: Probing LLMs' Understanding of Literary Style",
      "authors": ["Rebecca M. M. Hicke", "David Mimno"],
      "year": 2025,
      "venue": "Preprint",
      "abstract": "This paper investigates whether large language models can identify and reason about literary style, probing the extent to which models capture the aesthetic and structural features that distinguish one author or genre from another.",
      "arxiv": "https://arxiv.org/abs/2502.03647",
      "tags": ["LLMs", "literary style", "computational humanities", "NLP", "evaluation"]
    },
    {
      "title": "Lost in Space: Optimizing Tokens for Grammar-Constrained Decoding",
      "authors": ["Sil Hamilton", "David Mimno"],
      "year": 2025,
      "venue": "Preprint",
      "abstract": "This paper examines grammar-constrained decoding in language models, focusing on how token choice affects efficiency and quality when outputs must conform to formal grammars or structured formats.",
      "arxiv": "https://arxiv.org/abs/2502.14969",
      "tags": ["LLMs", "decoding", "structured generation", "tokenization"]
    },
    {
      "title": "A City of Millions: Mapping Literary Social Networks At Scale",
      "authors": ["Sil Hamilton", "Rebecca M. M. Hicke", "David Mimno", "Matthew Wilkens"],
      "year": 2025,
      "venue": "Preprint",
      "abstract": "This paper presents methods for extracting and analyzing social networks from large literary corpora at scale, enabling macro-level analysis of character relationships and social structures across many texts.",
      "arxiv": "https://arxiv.org/abs/2502.19590",
      "tags": ["computational humanities", "social networks", "literary analysis", "NLP"]
    },
    {
      "title": "Do Chinese Models Speak Chinese Languages?",
      "authors": ["Andrea W. Wen-Yi", "Unso Eun Seo Jo", "David Mimno"],
      "year": 2025,
      "venue": "Preprint",
      "abstract": "The release of top-performing open-weight LLMs has cemented China's role as a leading force in AI development. This paper asks: do these models support languages spoken in China, or do they speak the same languages as Western models? Comparing multilingual capabilities reveals insights into pre-training data curation and the linguistic diversity of frontier models.",
      "arxiv": "https://arxiv.org/abs/2504.00289",
      "tags": ["LLMs", "multilingual", "Chinese", "language models", "evaluation"]
    },
    {
      "title": "Tasks and Roles in Legal AI: Data Curation, Annotation, and Verification",
      "authors": ["Allison Koenecke", "Edward H. Stiglitz", "David Mimno", "Matthew Wilkens"],
      "year": 2025,
      "venue": "Preprint",
      "abstract": "The application of AI tools to the legal field feels natural: large legal document collections could be used with specialized AI to improve workflow efficiency for lawyers and ameliorate the justice gap for underserved clients. However, legal documents differ from the web-based text that underlies most AI systems. This paper examines the specific challenges of legal AI with attention to data curation, annotation, and the verification of model outputs.",
      "arxiv": "https://arxiv.org/abs/2504.01349",
      "tags": ["legal AI", "NLP", "annotation", "data curation", "AI ethics"]
    },
    {
      "title": "Automate or Assist? The Role of Computational Models in Identifying Gendered Discourse in US Capital Trial Transcripts",
      "authors": ["Andrea W. Wen-Yi", "Kathryn Adamson", "Nathalie Greenfield", "Rachel Goldberg", "Sandra Babcock", "David Mimno", "Allison Koenecke"],
      "year": 2024,
      "venue": "AIES 2024",
      "venue_full": "Proceedings of the 2024 AAAI/ACM Conference on AI, Ethics, and Society",
      "doi": "https://doi.org/10.1609/aies.v7i1.31746",
      "award": "Best Student Paper",
      "abstract": "The language used by US courtroom actors in criminal trials has long been studied for biases. However, systematic studies of bias in high-stakes court trials have been difficult, due to the nuanced nature of bias and the legal expertise required. Large language models offer the possibility to automate annotation. But validating the computational approach requires both an understanding of how automated methods fit in existing annotation workflows and what they really offer. This paper presents a case study of adding a computational model to a complex and high-stakes problem: identifying gender-biased language in US capital trials for women defendants.",
      "arxiv": "https://arxiv.org/abs/2407.12500",
      "pdf": "https://arxiv.org/pdf/2407.12500",
      "tags": ["LLMs", "legal NLP", "gender bias", "annotation", "AI ethics"]
    },
    {
      "title": "Computational Humanities",
      "authors": ["Jessica Marie Johnson", "Lauren Tilton", "David Mimno"],
      "year": 2024,
      "venue": "Debates in Digital Humanities",
      "venue_full": "Debates in Digital Humanities, University of Minnesota Press",
      "abstract": "An edited volume bringing together scholarship at the intersection of computational methods and humanistic inquiry, addressing how computational tools and approaches can enrich humanities research while remaining attentive to questions of culture, power, and interpretation.",
      "tags": ["computational humanities", "digital humanities", "edited volume"]
    },
    {
      "title": "Humanities and Human-Centered Machine Learning",
      "authors": ["Laure Thompson", "David Mimno"],
      "year": 2023,
      "venue": "Human-Centered Machine Learning",
      "venue_full": "Chapter in Human-Centered Machine Learning",
      "pdf": "https://mimno.infosci.cornell.edu/papers/thompson2023hcml.pdf",
      "abstract": "This chapter examines how humanistic approaches can inform and improve machine learning systems, arguing for closer integration of humanities perspectives in the design, evaluation, and deployment of ML systems that interact with human users and cultural artifacts.",
      "tags": ["human-centered ML", "computational humanities", "NLP"]
    },
    {
      "title": "A Pretrainer's Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, & Toxicity",
      "authors": ["Shayne Longpre", "Gregory Yauney", "Emily Reif", "Katherine Lee", "Adam Roberts", "Barret Zoph", "Denny Zhou", "Jason Wei", "Kevin Robinson", "David Mimno", "Daphne Ippolito"],
      "year": 2023,
      "venue": "Preprint / EMNLP 2023",
      "abstract": "Choices in data curation make a huge difference to pre-trained language model performance and often go unreported or undervalued compared to architecture and hyperparameter choices. This paper systematically measures the effects of data age, domain coverage, quality filtering, and toxicity filtering on downstream model performance, providing practical guidance for practitioners building training datasets.",
      "arxiv": "https://arxiv.org/abs/2305.13169",
      "pdf": "https://arxiv.org/pdf/2305.13169",
      "tags": ["LLMs", "training data", "pre-training", "data curation", "language models"]
    },
    {
      "title": "Modeling Legal Reasoning: LM Annotation at the Edge of Human Agreement",
      "authors": ["Rosamond Elizabeth Thalken", "Edward Stiglitz", "David Mimno", "Matthew Wilkens"],
      "year": 2023,
      "venue": "EMNLP 2023",
      "venue_full": "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
      "abstract": "This paper examines how well language models can annotate legal texts for tasks that require fine-grained legal reasoning, focusing on cases where even human expert annotators disagree. The study finds that model performance tracks closely with the difficulty of the annotation task as measured by inter-annotator agreement, with implications for the use of LLMs in legal AI applications.",
      "tags": ["legal NLP", "LLMs", "annotation", "inter-annotator agreement"]
    },
    {
      "title": "Data Similarity is Not Enough to Explain Language Model Performance",
      "authors": ["Gregory Yauney", "Emily Reif", "David Mimno"],
      "year": 2023,
      "venue": "EMNLP 2023",
      "venue_full": "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
      "abstract": "While some level of similarity between training data and evaluation data helps (a model trained on English does less well on Swahili), this paper evaluates several commonly used metrics of training-to-evaluation similarity and finds that not only do they not correlate with few-shot performance, they do not correlate with each other. This calls into question the use of data similarity as a proxy for expected model behavior.",
      "arxiv": "https://arxiv.org/abs/2311.09006",
      "pdf": "https://arxiv.org/pdf/2311.09006",
      "tags": ["LLMs", "evaluation", "training data", "few-shot learning", "data similarity"]
    },
    {
      "title": "Hyperpolyglot LLMs: Cross-Lingual Interpretability in Token Embeddings",
      "authors": ["Andrea W. Wang", "David Mimno"],
      "year": 2023,
      "venue": "EMNLP 2023",
      "venue_full": "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
      "abstract": "LLMs trained on scores of languages can discover a global, cross-language semantic embedding space as an emergent property. This paper investigates the structure of token embeddings in multilingual LLMs, finding evidence of cross-lingual alignment that was not explicitly trained for, with implications for multilingual transfer learning and model interpretability.",
      "pdf": "https://arxiv.org/pdf/2305.13169",
      "tags": ["LLMs", "multilingual", "interpretability", "embeddings", "cross-lingual"]
    },
    {
      "title": "Contextualized Topic Coherence Metrics",
      "authors": ["Hamed Rahimi", "Jacob Louis Hoover", "David Mimno", "Hubert Naacke", "Camelia Constantin", "Bernd Amann"],
      "year": 2023,
      "venue": "EACL Findings 2023",
      "venue_full": "Findings of the Association for Computational Linguistics: EACL 2023",
      "abstract": "This paper introduces protocols for using LLMs to evaluate topic quality, developing contextualized coherence metrics that go beyond traditional word co-occurrence measures. The proposed metrics better capture whether topic words form semantically coherent groups when considered in context.",
      "arxiv": "https://arxiv.org/abs/2305.14587",
      "pdf": "https://arxiv.org/pdf/2305.14587",
      "tags": ["topic modeling", "evaluation", "LLMs", "coherence metrics"]
    },
    {
      "title": "The Chatbot and the Canon: Poetry Memorization in LLMs",
      "authors": ["Lyra D'Souza", "David Mimno"],
      "year": 2023,
      "venue": "CHR 2023",
      "venue_full": "Proceedings of the Computational Humanities Research Conference 2023",
      "pdf": "https://ceur-ws.org/Vol-3558/paper5712.pdf",
      "abstract": "This paper investigates which poems large language models memorize and why, finding that memorization correlates with poem frequency in training data and that the literary canon is unevenly represented — canonical poems are more likely to be memorized than less frequently anthologized works, raising questions about cultural bias in LLM training corpora.",
      "tags": ["LLMs", "memorization", "poetry", "computational humanities", "training data"]
    },
    {
      "title": "T5 Meets Tybalt: Author Attribution in Early Modern English Drama Using Large Language Models",
      "authors": ["Rebecca Hicke", "David Mimno"],
      "year": 2023,
      "venue": "CHR 2023",
      "venue_full": "Proceedings of the Computational Humanities Research Conference 2023",
      "abstract": "This paper applies large language models to the problem of author attribution in Early Modern English drama, using fine-tuned T5 models to distinguish between playwrights. The approach is evaluated on contested and collaborative plays from the period, including works attributed to Shakespeare and his contemporaries.",
      "tags": ["computational humanities", "authorship attribution", "Early Modern English", "LLMs", "NLP"]
    },
    {
      "title": "Comparing Text Representations: A Theory-Driven Approach",
      "authors": ["Gregory Yauney", "David Mimno"],
      "year": 2021,
      "venue": "EMNLP 2021",
      "venue_full": "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
      "pdf": "https://aclanthology.org/2021.emnlp-main.449.pdf",
      "abstract": "This paper develops a theory-driven framework for comparing text representations, arguing that evaluation of representations should be grounded in formal properties we care about rather than downstream task performance alone. The framework is applied to compare bag-of-words, embedding-based, and contextual representations across several dimensions.",
      "tags": ["text representations", "embeddings", "evaluation", "NLP theory"]
    },
    {
      "title": "On-the-Fly Rectification for Robust Large-Vocabulary Topic Inference",
      "authors": ["Moontae Lee", "Sungjun Cho", "Kun Dong", "David Mimno", "David Bindel"],
      "year": 2021,
      "venue": "ICML 2021",
      "venue_full": "Proceedings of the 38th International Conference on Machine Learning",
      "pdf": "http://proceedings.mlr.press/v139/lee21c/lee21c.pdf",
      "abstract": "This paper addresses the challenge of robust topic inference for large vocabularies, introducing an on-the-fly rectification approach that corrects for estimation errors during inference. The method improves topic quality without requiring retraining, particularly for low-frequency vocabulary items.",
      "tags": ["topic modeling", "NLP", "inference", "large vocabulary"]
    },
    {
      "title": "Separating the Wheat from the Chaff: A Topic and Keyword-based Procedure for Identifying Research-Relevant Text",
      "authors": ["Alicia Eads", "Alexandra Schofield", "Fauna Mahootian", "David Mimno", "Rens Wilderom"],
      "year": 2021,
      "venue": "Poetics",
      "abstract": "This paper develops a procedure combining topic models and keyword search to identify research-relevant documents within large text collections, providing a practical workflow for social scientists working with large document corpora.",
      "pdf": "https://mimno.infosci.cornell.edu/papers/eads-separating.pdf",
      "tags": ["topic modeling", "text classification", "social science", "information retrieval"]
    },
    {
      "title": "Tecnologica cosa: Modeling Storyteller Personalities in Boccaccio's Decameron",
      "authors": ["A. Feder Cooper", "Maria Antoniak", "Christopher De Sa", "Marilyn Migiel", "David Mimno"],
      "year": 2021,
      "venue": "LaTech-CLfL 2021",
      "arxiv": "https://arxiv.org/abs/2109.10506",
      "pdf": "https://arxiv.org/pdf/2109.10506",
      "abstract": "This paper applies computational methods to model the distinct storytelling personalities of the narrators in Boccaccio's Decameron, finding that the text encodes statistically distinguishable differences between characters' narrative styles.",
      "tags": ["computational humanities", "literary analysis", "Italian literature", "NLP"]
    }
  ]
}