import pandas as pd
import re
import requests
import sys
from typing import *
sys.path.insert(0, "..")
from helpers import add_to_dict, log
from helpers_csv import csvs_to_list, dict_to_csvsexcel = requests.get("https://geschicktgendern.de/download/1642/").contentopen("geschicktgendern_raw.xlsx", "wb").write(excel)105817
df = pd.read_excel(
excel, header=None, names=["ungendered", "gendered"], skiprows=3, usecols=[1, 2]
)
df.sort_values(by="ungendered")
df.head()| ungendered | gendered | |
|---|---|---|
| 0 | <div id="A"><b>A</b><div> | NaN |
| 1 | Abbrecherquote | Abbruchquote |
| 2 | Abenteurer (sg.) | Waghals; abenteuerliebende Person; abenteuerlu... |
| 3 | Abgänger | absolvierende Person; Abschluss innehabende Pe... |
| 4 | Abiturient | Abitur ablegende Person; Person, die Abitur macht |
df.to_csv("geschicktgendern_raw.csv", index=False)
dflen = len(df)We drop rows like the first one, where there is merely some HTML description but no value.
df = df[df["gendered"].notna()]
df.head()| ungendered | gendered | |
|---|---|---|
| 1 | Abbrecherquote | Abbruchquote |
| 2 | Abenteurer (sg.) | Waghals; abenteuerliebende Person; abenteuerlu... |
| 3 | Abgänger | absolvierende Person; Abschluss innehabende Pe... |
| 4 | Abiturient | Abitur ablegende Person; Person, die Abitur macht |
| 5 | Abkömmling | abstammende Person; nachkommende Person; Kind;... |
Let’s look at a more complicated row:
df.loc[13]ungendered Absolventenvorsprechen [Schauspielschule]
gendered Abschlussvorsprechen; <div class="tooltip">Alu...
Name: 13, dtype: object
def clean(a):
a = re.sub(" ?\((sg|pl).?\) ?", "", a) # remove "(pl.)"
a = re.sub(" ?\.\.\. ?|…", "", a) # remove "..."
a = re.sub(" ?<[^>]*> ?", "", a) # remove tags
a = re.sub(" ", " ", a) # remove double spaces
a = re.sub("^ | $", "", a) # remove trailing spaces
if re.match("^[A-ZÄÖÜ][a-zäöüß]+ende$", a):
a = a[0].lower() + a[1:] + " Person"
return arecords = df.to_records()
dic: Dict[str, Dict[str, str]] = {"any": {}, "sg": {}, "pl": {}}
i = 0
for (_, ungendered, gendered) in records:
suggestions = gendered.split(";")
add_to_dict(clean(ungendered), [clean(s) for s in suggestions], dic["sg"])
i += 1print("#rules raw:", dflen)
print("#rules after dropping NAs:", len(df))
print("#rules after filtering too complicated:", i)#rules raw: 1859
#rules after dropping NAs: 1833
#rules after filtering too complicated: 1833
We save this as CSV:
dict_to_csvs(dic, "geschicktgendern")And we provide a method to parse the CSV to a dictionary again, so the file can be used easily in other scripts again:
list_ = csvs_to_list("geschicktgendern")
list_[:5][['(Deutscher / Welt-) Meister',
'Erster Platz der Deutschen Meisterschaft / Weltmeisterschaft',
'0'],
['(Deutscher / Welt-) Meister', 'Erstplatzierte', '0'],
['(Diplom-)Ingenieur', 'Person mit Ingenieursdiplom', '0'],
['(der) andere', 'Gegenüber', '0'],
['1000-Mann-Quote', '1000-Personen-Quote', '0']]