We extract some data from the “Deutsches Referenzkorpus” (DeReKo) by manually querying it via COSMAS II. The resulting files are saved as .txt files in this folder.

Queries:

- Internal I: :Ab:*?Innen: 241k tokens, 18k types (:Ab:*?In and :Ab:#REG(^[A-ZÄÖÜ][a-zäöüß]+In(nen)?$) throw errors)
- Slash: #REG(^[A-ZÄÖÜ][a-zäöüß]+\/in(nen)?$): 136k tokens, 9k types
- Star: #REG(^[A-ZÄÖÜ][a-zäöüß]+\*in(nen)?$): 48k tokens, 5k types
- Colon: #REG(^[A-ZÄÖÜ][a-zäöüß]+:in(nen)?$): 10k tokens, 3k types
- Underscore: #REG(^[A-ZÄÖÜ][a-zäöüß]+_in(nen)?$): 3k tokens, 1k types
- Interpunct: #REG(^[A-ZÄÖÜ][a-zäöüß]+·in(nen)?$): 4(!) matches
- Brackets: *?\(In\), *?\(Innen\), #REG(\(in(nen)\)) and similar queries throw errors

There is no machine-readable download on DeReKo to our knowledge (KorAP should do this, but is still work in progress), so we process the files a bit:

from typing import *
import re
import sys

sys.path.insert(0, "..")
from helpers import add_to_dict, log
from helpers_csv import csvs_to_list, dict_to_csvs

We want to keep only the entries that are actually properly gendered, and from each entry we want only the gendered word itself, so we write a somewhat complicated regex to find them:

# Pattern for a properly gendered word: one capital letter, at least three
# lowercase letters, and then exactly one of these suffix styles:
#   * separator + "in"/"innen"   (Lehrer/in, Lehrer*innen, Lehrer:in, ...)
#   * bracketed "(in)"/"(innen)" (both brackets are required)
#   * internal I "In"/"Innen"    (LehrerIn, LehrerInnen)
# The pattern is deliberately unanchored so it can be used both with
# ``re.match`` (prefix of a line) and ``re.fullmatch`` (whole word).
match_properly_gendered_word = (
    r"[A-ZÄÖÜ][a-zäöüß]{3,}"
    r"(?:[/*:_·]in(?:nen)?|\(in(?:nen)?\)|In(?:nen)?)"
)


def is_properly_gendered_word(word: str) -> bool:
    """Return True if *word* as a whole is a properly gendered word.

    The whole string must match ``match_properly_gendered_word``; a
    trailing newline or any other surplus character makes it fail.
    """
    return re.fullmatch(match_properly_gendered_word, word) is not None


assert is_properly_gendered_word("Bundeskanzler:innen") == True
assert is_properly_gendered_word("BundeskanzlerIn") == True
assert is_properly_gendered_word("Bundeskanzler*Innen") == False
# Brackets must be balanced, and only bracket forms may carry a ")".
assert is_properly_gendered_word("Bundeskanzler(in)") == True
assert is_properly_gendered_word("Bundeskanzler(in") == False
assert is_properly_gendered_word("Bundeskanzler*in)") == False

And then we define a function specifically targeted at the structure of the DeReKo output files:

def dereko_to_csv(
    filename: str, skip_lines: int = 20, encoding: Optional[str] = None
) -> List[str]:
    """Extract the gendered words from a DeReKo export and save them.

    Reads ``<filename>.txt``, drops the first *skip_lines* lines, and keeps,
    for every remaining line that starts with a properly gendered word, just
    that word. The words are written one per line to ``<filename>.csv``.

    Args:
        filename: Path of the export without the ``.txt`` extension.
        skip_lines: Number of leading lines to ignore (presumably the export
            header -- confirm against the actual files). Defaults to 20.
        encoding: Text encoding for both files. ``None`` keeps the platform
            default, as before; pass it explicitly if umlauts get garbled.

    Returns:
        The extracted words in file order (duplicates are kept).
    """
    with open(filename + ".txt", encoding=encoding) as infile:
        lines = infile.read().split("\n")[skip_lines:]

    words = []
    for line in lines:
        # re.match anchors at the start of the line only, so anything that
        # follows the word on the same line is discarded.
        found = re.match(match_properly_gendered_word, line)
        if found:
            words.append(found[0])

    with open(filename + ".csv", "w", encoding=encoding) as outfile:
        outfile.write("\n".join(words))
    return words


# Sanity check on the star export; calling dereko_to_csv also (re)writes star.csv.
assert "Bundeskanzler*in" in dereko_to_csv("star")
# Preview the first five words extracted from the internal-I export.
dereko_to_csv("internal-i")[:5]
['AachenerInnen',
 'AbbiegerInnen',
 'AbbrecherInnen',
 'AbeitsplatzbesitzerInnen',
 'AbendländerInnen']
# Preview the first five words extracted from the colon export.
dereko_to_csv("colon")[:5]
['Abenteurer:innen',
 'Abiturient:innen',
 'Ablehner:innen',
 'Abnehmer:innen',
 'Abonennt:innen']

We want to distinguish singular and plural, which luckily is easy for gendered words:

def is_gendered_plural(word: str) -> bool:
    """Return True if *word* ends in a gendered plural suffix.

    A plural is recognised by a final "innen" or "Innen", optionally
    followed by a closing bracket (as in "Lehrer(innen)").
    """
    return re.search(r"[Ii]nnen\)?$", word) is not None


assert is_gendered_plural("Bundeskanzler*in") == False
assert is_gendered_plural("Bundesminister/in") == False
assert is_gendered_plural("Bundesminister*innen") == True

And we want to ungender them. This also seems simple at first:

def male_sg(word: str) -> str:
    """Strip the (gendered) feminine suffix from the end of *word*.

    Removes a final "in"/"In"/"innen"/"Innen" together with an optional
    separator character in front of it, an optional hyphen (as in "/-in")
    and an optional closing bracket (as in "(in)").

    The separator is optional on purpose, so plain feminine forms such as
    "Bundeskanzlerin" are reduced as well. This is a purely textual
    operation: stems that change in the masculine form are not restored
    (e.g. "Kollegin" becomes "Kolleg", not "Kollege").
    """
    return re.sub(r"[/*:_·()]?-?[Ii]n(?:nen)?\)?$", "", word)


assert male_sg("Bundeskanzler*in") == "Bundeskanzler"
assert male_sg("Bundesminister*innen") == "Bundesminister"
assert male_sg("Bundeskanzler/-in") == "Bundeskanzler"
assert male_sg("Bundeskanzler(in)") == "Bundeskanzler"
def female_sg(word: str) -> str:
    """Turn a gendered word into its plain feminine singular form.

    Replaces a final "in"/"In"/"innen"/"Innen" -- together with an optional
    separator character, an optional hyphen (as in "/-in") and an optional
    closing bracket (as in "(in)") -- by a plain "in".
    """
    return re.sub(r"[/*:_·()]?-?[Ii]n(?:nen)?\)?$", r"in", word)


assert female_sg("Bundeskanzler*in") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler:in") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler_in") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler/in") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler/-in") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler·in") == "Bundeskanzlerin"
assert female_sg("BundeskanzlerIn") == "Bundeskanzlerin"
assert female_sg("BundesministerIn") == "Bundesministerin"
assert female_sg("BundesministerInnen") == "Bundesministerin"
assert female_sg("Bundeskanzler(in)") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler(innen)") == "Bundeskanzlerin"

But then there are also cases like these, where our method fails:

# Known limitations, pinned on purpose: male_sg only strips the suffix, so it
# yields "Abiturient" (not the plural "Abiturienten") and "Kolleg" (not
# "Kollege", whose final "e" is absent from the feminine form).
assert not male_sg("Abiturient*innen") == "Abiturienten"
assert not male_sg("Kollegin") == "Kollege"
# Maps a key to the number of times it has been passed to add_to_count_dict.
count_dict = {}


def add_to_count_dict(key):
    """Increase the counter stored for *key* in ``count_dict`` by one.

    A key that has not been seen before starts at a count of 1.
    """
    count_dict[key] = count_dict.get(key, 0) + 1
# Convert every DeReKo export (one per gendering style) and keep the
# extracted word lists, in the order given here.
dereko_lists = [
    dereko_to_csv(export_name)
    for export_name in (
        "colon",
        "internal-i",
        "interpunct",
        "slash",
        "star",
        "underscore",
    )
]

# Count singular and plural entries across all lists and, in the same pass,
# tally how often each feminine singular form occurs.
sg_count = 0
pl_count = 0
for l in dereko_lists:
    for word in l:
        if not is_properly_gendered_word(word):
            continue
        if is_gendered_plural(word):
            pl_count += 1
        else:
            sg_count += 1
        add_to_count_dict(female_sg(word))

print("total gendered words in sg", sg_count)
print("total gendered words in pl", pl_count)
total gendered words in sg 4607
total gendered words in pl 20025
# Build the mapping that is written to the "dereko_unified" CSV files.
# Only feminine forms that were counted at least twice are kept; each one is
# filed under the form male_sg derives from it.
# NOTE(review): "sg" and "pl" receive exactly the same (singular) entries --
# confirm whether plural forms were meant to go into dic["pl"].
dic = {"sg": {}, "pl": {}}
for key, count in count_dict.items():
    if count < 2:
        continue
    male_form = male_sg(key)
    for number in ("sg", "pl"):
        add_to_dict(male_form, [key], dic[number])
dict_to_csvs(dic, "dereko_unified")

We check whether reading the data back to Python works well:

# Read the files written under the name "dereko_unified" back in and preview
# the first five rows.
list_ = csvs_to_list("dereko_unified")
list_[:5]
[['Abbrecher', 'Abbrecherin', '0'],
 ['Abenteurer', 'Abenteurerin', '0'],
 ['Abfallberater', 'Abfallberaterin', '0'],
 ['Abgänger', 'Abgängerin', '0'],
 ['Abiturien', 'Abiturienin', '0']]