import pandas as pd
import re
import requests
import sys
from typing import *

sys.path.insert(0, "..")
from helpers import add_to_dict, log
from helpers_csv import csvs_to_list, dict_to_csvs
excel = requests.get("https://geschicktgendern.de/download/1642/").content
open("geschicktgendern_raw.xlsx", "wb").write(excel)
105817
df = pd.read_excel(
    excel, header=None, names=["ungendered", "gendered"], skiprows=3, usecols=[1, 2]
)
df.sort_values(by="ungendered")
df.head()
ungendered gendered
0 <div id="A"><b>A</b><div> NaN
1 Abbrecherquote Abbruchquote
2 Abenteurer (sg.) Waghals; abenteuerliebende Person; abenteuerlu...
3 Abgänger absolvierende Person; Abschluss innehabende Pe...
4 Abiturient Abitur ablegende Person; Person, die Abitur macht
df.to_csv("geschicktgendern_raw.csv", index=False)
dflen = len(df)

We drop rows like the first one, where there is merely some HTML description but no value.

df = df[df["gendered"].notna()]
df.head()
ungendered gendered
1 Abbrecherquote Abbruchquote
2 Abenteurer (sg.) Waghals; abenteuerliebende Person; abenteuerlu...
3 Abgänger absolvierende Person; Abschluss innehabende Pe...
4 Abiturient Abitur ablegende Person; Person, die Abitur macht
5 Abkömmling abstammende Person; nachkommende Person; Kind;...

Let’s look at a more complicated row:

df.loc[13]
ungendered            Absolventenvorsprechen [Schauspielschule]
gendered      Abschlussvorsprechen; <div class="tooltip">Alu...
Name: 13, dtype: object
def clean(a):
    a = re.sub(" ?\((sg|pl).?\) ?", "", a)  # remove "(pl.)"
    a = re.sub(" ?\.\.\. ?|…", "", a)  # remove "..."
    a = re.sub(" ?<[^>]*> ?", "", a)  # remove tags
    a = re.sub("  ", " ", a)  # remove double spaces
    a = re.sub("^ | $", "", a)  # remove trailing spaces
    if re.match("^[A-ZÄÖÜ][a-zäöüß]+ende$", a):
        a = a[0].lower() + a[1:] + " Person"
    return a
records = df.to_records()

dic: Dict[str, Dict[str, str]] = {"any": {}, "sg": {}, "pl": {}}
i = 0
for (_, ungendered, gendered) in records:
    suggestions = gendered.split(";")
    add_to_dict(clean(ungendered), [clean(s) for s in suggestions], dic["sg"])
    i += 1
print("#rules raw:", dflen)
print("#rules after dropping NAs:", len(df))
print("#rules after filtering too complicated:", i)
#rules raw: 1859
#rules after dropping NAs: 1833
#rules after filtering too complicated: 1833

We save this as CSV:

dict_to_csvs(dic, "geschicktgendern")

And we provide a method to parse the CSV to a dictionary again, so the file can be used easily in other scripts again:

list_ = csvs_to_list("geschicktgendern")
list_[:5]
[['(Deutscher / Welt-) Meister',
  'Erster Platz der Deutschen Meisterschaft / Weltmeisterschaft',
  '0'],
 ['(Deutscher / Welt-) Meister', 'Erstplatzierte', '0'],
 ['(Diplom-)Ingenieur', 'Person mit Ingenieursdiplom', '0'],
 ['(der) andere', 'Gegenüber', '0'],
 ['1000-Mann-Quote', '1000-Personen-Quote', '0']]