import io
import re
import sys
from typing import *

import pandas as pd
import requests
# Make the parent directory importable so the project-local helper modules
# can be found; the path must be extended *before* the helper import below.
sys.path.insert(0, "..")
from helpers import add_to_dict, log
from helpers_csv import csvs_to_list, dict_to_csvs
# Download the raw spreadsheet and keep a local copy of the exact bytes.
response = requests.get("https://geschicktgendern.de/download/1642/", timeout=30)
# Fail early on an HTTP error instead of trying to parse an error page as Excel.
response.raise_for_status()
excel = response.content
with open("geschicktgendern_raw.xlsx", "wb") as raw_file:
    raw_file.write(excel)
len(excel)  # number of bytes downloaded (equals the number of bytes written)
105817
# Parse the downloaded bytes: no header row, skip the first three rows and read
# only the second and third columns as "ungendered" / "gendered".
# The bytes are wrapped in BytesIO because passing raw bytes to read_excel is
# deprecated in recent pandas versions.
df = pd.read_excel(
    io.BytesIO(excel),
    header=None,
    names=["ungendered", "gendered"],
    skiprows=3,
    usecols=[1, 2],
)
# NOTE: sort_values returns a new DataFrame; the result is not assigned, so the
# row order of `df` is left unchanged here.
df.sort_values(by="ungendered")
df.head()
ungendered | gendered | |
---|---|---|
0 | <div id="A"><b>A</b><div> | NaN |
1 | Abbrecherquote | Abbruchquote |
2 | Abenteurer (sg.) | Waghals; abenteuerliebende Person; abenteuerlu... |
3 | Abgänger | absolvierende Person; Abschluss innehabende Pe... |
4 | Abiturient | Abitur ablegende Person; Person, die Abitur macht |
# Persist the unfiltered table and remember its size for the summary printed later.
df.to_csv("geschicktgendern_raw.csv", index=False)
dflen = len(df)
We drop rows like the first one, where there is merely some HTML description but no value.
# Keep only rows that actually have a value in the "gendered" column.
df = df[df["gendered"].notna()]
df.head()
ungendered | gendered | |
---|---|---|
1 | Abbrecherquote | Abbruchquote |
2 | Abenteurer (sg.) | Waghals; abenteuerliebende Person; abenteuerlu... |
3 | Abgänger | absolvierende Person; Abschluss innehabende Pe... |
4 | Abiturient | Abitur ablegende Person; Person, die Abitur macht |
5 | Abkömmling | abstammende Person; nachkommende Person; Kind;... |
Let’s look at a more complicated row:
# Show the row with index label 13.
df.loc[13]
ungendered Absolventenvorsprechen [Schauspielschule]
gendered Abschlussvorsprechen; <div class="tooltip">Alu...
Name: 13, dtype: object
def clean(a: str) -> str:
    """Normalize one raw spreadsheet entry into a plain phrase.

    Strips number markers such as "(sg.)" / "(pl.)", ellipses, HTML tags and
    superfluous spaces. A single capitalized word ending in "ende"
    (e.g. "Studierende") is turned into "<lowercased word> Person".

    Args:
        a: Raw cell text (or one ";"-separated suggestion from a cell).

    Returns:
        The cleaned text.
    """
    a = re.sub(r" ?\((sg|pl).?\) ?", "", a)  # remove "(pl.)"
    a = re.sub(r" ?\.\.\. ?|…", "", a)  # remove "..."
    a = re.sub(r" ?<[^>]*> ?", "", a)  # remove tags (and one adjacent space per side)
    a = re.sub(r" {2,}", " ", a)  # collapse any run of spaces into one
    a = a.strip(" ")  # remove leading/trailing spaces
    # A lone nominalized participle ("Studierende") becomes "studierende Person".
    if re.match(r"^[A-ZÄÖÜ][a-zäöüß]+ende$", a):
        a = a[0].lower() + a[1:] + " Person"
    return a
# Convert the frame to records; each record is (index, ungendered, gendered).
records = df.to_records()

# Rule dictionaries keyed by grammatical number. In this script every rule is
# added to the "sg" dictionary; "any" and "pl" stay empty.
dic: Dict[str, Dict[str, str]] = {"any": {}, "sg": {}, "pl": {}}
i = 0  # number of rows processed
for (_, ungendered, gendered) in records:
    # A cell may hold several alternatives separated by ";".
    suggestions = gendered.split(";")
    add_to_dict(clean(ungendered), [clean(s) for s in suggestions], dic["sg"])
    i += 1
print("#rules raw:", dflen)
print("#rules after dropping NAs:", len(df))
# NOTE: no row is skipped in the loop above, so this count equals len(df).
print("#rules after filtering too complicated:", i)
#rules raw: 1859
#rules after dropping NAs: 1833
#rules after filtering too complicated: 1833
We save this as CSV:
# Write the rule dictionaries out under the "geschicktgendern" name.
dict_to_csvs(dic, "geschicktgendern")
We also provide a function that parses the CSV files back into a list, so the data can easily be reused in other scripts:
# Read the saved rules back in and show the first five entries.
list_ = csvs_to_list("geschicktgendern")
list_[:5]
[['(Deutscher / Welt-) Meister',
'Erster Platz der Deutschen Meisterschaft / Weltmeisterschaft',
'0'],
['(Deutscher / Welt-) Meister', 'Erstplatzierte', '0'],
['(Diplom-)Ingenieur', 'Person mit Ingenieursdiplom', '0'],
['(der) andere', 'Gegenüber', '0'],
['1000-Mann-Quote', '1000-Personen-Quote', '0']]