from typing import *
import re
import sys
# Put the parent directory first on the import path; the `helpers` module
# imported below presumably lives there (TODO confirm).
sys.path.insert(0, "..")
from helpers import add_to_dict, log
from helpers_csv import csvs_to_list, dict_to_csvs
We extract some data from the “Deutsches Referenzkorpus” (DeReKo) by querying it manually via COSMAS II. The resulting files are saved as .txt files in this folder.
Queries:
- Internal I: `:Ab:*?Innen`: 241k tokens, 18k types (`:Ab:*?In` and `:Ab:#REG(^[A-ZÄÖÜ][a-zäöüß]+In(nen)?$)` throw errors)
- Slash: `#REG(^[A-ZÄÖÜ][a-zäöüß]+\/in(nen)?$)`: 136k tokens, 9k types
- Star: `#REG(^[A-ZÄÖÜ][a-zäöüß]+\*in(nen)?$)`: 48k tokens, 5k types
- Colon: `#REG(^[A-ZÄÖÜ][a-zäöüß]+:in(nen)?$)`: 10k tokens, 3k types
- Underscore: `#REG(^[A-ZÄÖÜ][a-zäöüß]+_in(nen)?$)`: 3k tokens, 1k types
- Interpunct: `#REG(^[A-ZÄÖÜ][a-zäöüß]+·in(nen)?$)`: 4(!) matches
- Brackets: `*?\(In\)`, `*?\(Innen\)`, `#REG(\(in(nen)\))` and similar queries throw errors
There is no machine-readable download on DeReKo to our knowledge (KorAP should do this, but is still work in progress), so we process the files a bit:
We want to keep only entries that are actually properly gendered, and we only want these properly gendered words, so we write some complicated regexes to find them:
# Unanchored pattern for one gendered word: an uppercase letter, at least three
# lowercase letters, then either a separator (/ * : _ · or an opening bracket)
# followed by "in"/"innen" and an optional closing bracket, or an internal-I
# suffix "In"/"Innen". dereko_to_csv applies it with re.match, i.e. at the
# start of each line.
match_properly_gendered_word = r"[A-ZÄÖÜ][a-zäöüß]{3,}(([/*:_·(]in(nen)?\)?)|In(nen)?)"
def is_properly_gendered_word(word: str) -> bool:
    """Return whether *word* as a whole is a gendered form accepted here.

    Accepted shape: one uppercase letter, at least three lowercase letters,
    then either

    * a separator (``/ * : _ ·`` or ``(``) followed by lowercase ``in`` or
      ``innen`` and an optional closing bracket, or
    * an internal-I suffix ``In`` or ``Innen`` without separator.

    Mixed forms such as ``Bundeskanzler*Innen`` (separator plus capital I)
    are rejected.
    """
    pattern = r"^[A-ZÄÖÜ][a-zäöüß]{3,}(([/*:_·(]in(nen)?\)?)|In(nen)?)$"
    return re.search(pattern, word) is not None


assert is_properly_gendered_word("Bundeskanzler:innen")
assert is_properly_gendered_word("BundeskanzlerIn")
assert not is_properly_gendered_word("Bundeskanzler*Innen")
And then we define a function specifically targeted at the structure of the DeReKo output files:
def dereko_to_csv(filename: str) -> List[str]:
    """Extract the gendered words from a DeReKo export and save them.

    Reads ``<filename>.txt``, keeps from each line the gendered word found at
    its start (lines without one are dropped), writes the words one per line
    to ``<filename>.csv`` and returns them.

    Args:
        filename: Path of the export without the ``.txt`` extension.

    Returns:
        The extracted words in file order.
    """
    # Context managers make sure both files are closed (and the output
    # flushed) before the function returns.
    with open(filename + ".txt") as source:
        text = source.read()

    # The first 20 lines are skipped -- presumably the export header
    # (TODO confirm against an actual export file).
    lines = text.split("\n")[20:]

    # Match each line once and reuse the match object.
    matches = (re.match(match_properly_gendered_word, line) for line in lines)
    words = [match[0] for match in matches if match]

    with open(filename + ".csv", "w") as target:
        target.write("\n".join(words))
    return words
# Sanity check on the "star" export, then preview the first internal-I words.
assert "Bundeskanzler*in" in dereko_to_csv("star")
dereko_to_csv("internal-i")[:5]
['AachenerInnen',
'AbbiegerInnen',
'AbbrecherInnen',
'AbeitsplatzbesitzerInnen',
'AbendländerInnen']
"colon")[:5] dereko_to_csv(
['Abenteurer:innen',
'Abiturient:innen',
'Ablehner:innen',
'Abnehmer:innen',
'Abonennt:innen']
We want to distinguish singular and plural, which luckily is easy for gendered words:
def is_gendered_plural(word: str) -> bool:
    """Return whether *word* ends in the plural suffix ``innen``/``Innen``.

    An optional closing bracket after the suffix is tolerated, so bracket
    forms such as ``Lehrer(innen)`` count as plural too.
    """
    return re.search(r"[Ii]nnen\)?$", word) is not None


assert not is_gendered_plural("Bundeskanzler*in")
assert not is_gendered_plural("Bundesminister/in")
assert is_gendered_plural("Bundesminister*innen")
And we want to ungender them. This also seems simple at first:
def male_sg(word: str) -> str:
    """Strip the gender suffix from *word*.

    Removes a trailing ``in``/``innen`` (either case of the ``i``) together
    with an optional preceding separator (``/ * : _ · ( )``), an optional
    hyphen as in ``/-in``, and an optional closing bracket as in ``(in)``.
    Words without such an ending are returned unchanged.

    Known limitation: only the suffix is cut off, the stem is not adjusted,
    so ``"Kollegin"`` becomes ``"Kolleg"`` rather than ``"Kollege"``.
    """
    # "-?" and "\)?" cover "/-in" and "(in)"/"(innen)"; without them those
    # forms were left with separator residue or not stripped at all.
    return re.sub(r"[/*:_·()]?-?[Ii]n(?:nen)?\)?$", "", word)


assert male_sg("Bundeskanzler*in") == "Bundeskanzler"
assert male_sg("Bundesminister*innen") == "Bundesminister"
assert male_sg("Lehrer(innen)") == "Lehrer"
def female_sg(word: str) -> str:
    """Normalise a gendered form to one ending in a plain lowercase ``in``.

    The trailing ``in``/``innen`` (either case of the ``i``), together with an
    optional separator (``/ * : _ · ( )``), an optional hyphen as in ``/-in``
    and an optional closing bracket as in ``(in)``, is replaced by ``in``.
    Words without such an ending are returned unchanged.
    """
    # "\)?" lets bracket forms such as "Lehrer(in)" match; previously the
    # trailing ")" kept the "$"-anchored pattern from matching at all.
    return re.sub(r"[/*:_·()]?-?[Ii]n(?:nen)?\)?$", r"in", word)


assert female_sg("Bundeskanzler*in") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler:in") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler_in") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler/in") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler/-in") == "Bundeskanzlerin"
assert female_sg("Bundeskanzler·in") == "Bundeskanzlerin"
assert female_sg("BundeskanzlerIn") == "Bundeskanzlerin"
assert female_sg("BundesministerIn") == "Bundesministerin"
assert female_sg("BundesministerInnen") == "Bundesministerin"
assert female_sg("Lehrer(in)") == "Lehrerin"
But then there are also cases like these, where our method fails:
# Documented failure cases: male_sg does not produce these forms.
assert male_sg("Abiturient*innen") != "Abiturienten"
assert male_sg("Kollegin") != "Kollege"
# Maps a key to the number of times add_to_count_dict was called with it.
count_dict = {}


def add_to_count_dict(key) -> None:
    """Increment the counter stored for *key* in ``count_dict``.

    A key seen for the first time starts at 1.
    """
    count_dict[key] = count_dict.get(key, 0) + 1
# Run the extraction for every export file; one word list per query type.
dereko_lists = [
    dereko_to_csv(name)
    for name in ["colon", "internal-i", "interpunct", "slash", "star", "underscore"]
]
# Count singular and plural gendered tokens over all exports and tally how
# often each normalised female singular form occurs.
sg_count = 0
pl_count = 0
for dereko_list in dereko_lists:
    for word in dereko_list:
        if is_properly_gendered_word(word):
            if is_gendered_plural(word):
                pl_count += 1
            else:
                sg_count += 1

            # NOTE(review): the original indentation was lost; this call is
            # assumed to belong inside the properly-gendered guard -- confirm.
            add_to_count_dict(female_sg(word))
print("total gendered words in sg", sg_count)
print("total gendered words in pl", pl_count)
total gendered words in sg 4607
total gendered words in pl 20025
# Collect every form that was counted at least twice and export the result.
dic = {"sg": {}, "pl": {}}
for key, count in count_dict.items():
    if count >= 2:
        # Each key is registered under its suffix-stripped form in both the
        # "sg" and the "pl" dictionary.
        add_to_dict(male_sg(key), [key], dic["sg"])
        add_to_dict(male_sg(key), [key], dic["pl"])
# NOTE(review): the original indentation was lost; the export is assumed to
# run once after the loop -- confirm.
dict_to_csvs(dic, "dereko_unified")
We check whether reading the data back to Python works well:
# Read the exported data back in and preview the first five entries.
list_ = csvs_to_list("dereko_unified")
list_[:5]
[['Abbrecher', 'Abbrecherin', '0'],
['Abenteurer', 'Abenteurerin', '0'],
['Abfallberater', 'Abfallberaterin', '0'],
['Abgänger', 'Abgängerin', '0'],
['Abiturien', 'Abiturienin', '0']]