diff --git a/.gitignore b/.gitignore
index 0869605..f533e4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ ENV/
*.zip
*.txt
*.csv
+*.xlsx
diff --git a/Professionals_Activities_Inconsistencies.ipynb b/Professionals_Activities_Inconsistencies.ipynb
index 39dde05..81de874 100644
--- a/Professionals_Activities_Inconsistencies.ipynb
+++ b/Professionals_Activities_Inconsistencies.ipynb
@@ -1,30 +1,15 @@
{
"cells": [
{
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "58db5082e27759f7",
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T22:45:53.794236Z",
"start_time": "2025-08-19T22:45:51.445477Z"
}
},
- "cell_type": "code",
- "source": [
- "%%time\n",
- "import polars as pd\n",
- "import csv\n",
- "folder = \"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\\"\n",
- "input_file = \"Table_Réf_Professionnels_250815.txt\"\n",
- "output_file = \"Table_Réf_Professionnels_inconsistencies\"\n",
- "output_extension = \".csv\"\n",
- "df = pd.read_csv(f\"{folder}{input_file}\",\n",
- " separator='|',\n",
- " quote_char=None,\n",
- " null_values='',\n",
- " infer_schema_length=0) # Read all columns as strings\n",
- "df = df.with_row_index('index')\n",
- "df.columns\n"
- ],
- "id": "58db5082e27759f7",
"outputs": [
{
"name": "stdout",
@@ -102,32 +87,33 @@
"output_type": "execute_result"
}
],
- "execution_count": 57
+ "source": [
+ "%%time\n",
+ "import polars as pd\n",
+ "import csv\n",
+ "folder = \"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\\"\n",
+ "input_file = \"Table_Réf_Professionnels_260319.txt\"\n",
+ "output_file = \"Table_Réf_Professionnels_inconsistencies_new\"\n",
+ "output_extension = \".csv\"\n",
+ "df = pd.read_csv(f\"{folder}{input_file}\",\n",
+ " separator='|',\n",
+ " quote_char=None,\n",
+ " null_values='',\n",
+ " infer_schema_length=0) # Read all columns as strings\n",
+ "df = df.with_row_index('index')\n",
+ "df.columns\n"
+ ]
},
{
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "7d9b7562c09955",
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T22:45:54.952210Z",
"start_time": "2025-08-19T22:45:53.873718Z"
}
},
- "cell_type": "code",
- "source": [
- "%%time\n",
- "df1 = (\n",
- " df.with_columns(\n",
- " (pd.col(\"Nom d'exercice\") + \" \" + pd.col(\"Prénom d'exercice\")).alias(\"Nom_Prénom\")\n",
- " )\n",
- " .with_columns(\n",
- " pd.col(\"Nom_Prénom\").n_unique().over(\"Identifiant PP\").alias(\"Count\")\n",
- " )\n",
- " .filter(pd.col(\"Count\") > 1)\n",
- " .sort([\"Identifiant PP\", \"index\"])\n",
- " .select(\"index\", \"Identifiant PP\", \"Count\", \"Nom d'exercice\", \"Prénom d'exercice\")\n",
- ")\n",
- "df1\n"
- ],
- "id": "7d9b7562c09955",
"outputs": [
{
"name": "stdout",
@@ -139,6 +125,16 @@
},
{
"data": {
+ "text/html": [
+ "
\n",
+ "
shape: (9_108, 5)| index | Identifiant PP | Count | Nom d'exercice | Prénom d'exercice |
|---|
| u32 | str | u32 | str | str |
| 1352933 | "10000034180" | 2 | "DUWAT-GEORGES" | "GHISLAINE" |
| 1352934 | "10000034180" | 2 | "GEORGES" | "GHISLAINE" |
| 261 | "10000040062" | 2 | "MEYER" | "Nicolas" |
| 262 | "10000040062" | 2 | "MEYER" | "Nicolas" |
| 811196 | "10000040062" | 2 | "MEYER" | "NICOLAS" |
| … | … | … | … | … |
| 2162425 | "10111110721" | 2 | "BARREAU" | "Nadège" |
| 268237 | "10111112636" | 2 | "GIRAUDET" | "MEGGIE" |
| 1892318 | "10111112636" | 2 | "GIRAUDET" | "Meggie" |
| 269544 | "10111320304" | 2 | "Sengel" | "Coralie" |
| 1352396 | "10111320304" | 2 | "SENGEL" | "Coralie" |
"
+ ],
"text/plain": [
"shape: (9_108, 5)\n",
"┌─────────┬────────────────┬───────┬────────────────┬───────────────────┐\n",
@@ -158,16 +154,6 @@
"│ 269544 ┆ 10111320304 ┆ 2 ┆ Sengel ┆ Coralie │\n",
"│ 1352396 ┆ 10111320304 ┆ 2 ┆ SENGEL ┆ Coralie │\n",
"└─────────┴────────────────┴───────┴────────────────┴───────────────────┘"
- ],
- "text/html": [
- "\n",
- "
shape: (9_108, 5)| index | Identifiant PP | Count | Nom d'exercice | Prénom d'exercice |
|---|
| u32 | str | u32 | str | str |
| 1352933 | "10000034180" | 2 | "DUWAT-GEORGES" | "GHISLAINE" |
| 1352934 | "10000034180" | 2 | "GEORGES" | "GHISLAINE" |
| 261 | "10000040062" | 2 | "MEYER" | "Nicolas" |
| 262 | "10000040062" | 2 | "MEYER" | "Nicolas" |
| 811196 | "10000040062" | 2 | "MEYER" | "NICOLAS" |
| … | … | … | … | … |
| 2162425 | "10111110721" | 2 | "BARREAU" | "Nadège" |
| 268237 | "10111112636" | 2 | "GIRAUDET" | "MEGGIE" |
| 1892318 | "10111112636" | 2 | "GIRAUDET" | "Meggie" |
| 269544 | "10111320304" | 2 | "Sengel" | "Coralie" |
| 1352396 | "10111320304" | 2 | "SENGEL" | "Coralie" |
"
]
},
"execution_count": 58,
@@ -175,39 +161,11 @@
"output_type": "execute_result"
}
],
- "execution_count": 58
- },
- {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-08-19T22:45:55.090712Z",
- "start_time": "2025-08-19T22:45:55.072647Z"
- }
- },
- "cell_type": "code",
- "source": [
- "df1.write_csv(f\"{folder}{output_file}-Names_Variations_Strict{output_extension}\",\n",
- " separator='|',\n",
- " quote_style=\"never\",\n",
- " line_terminator='\\n')\n"
- ],
- "id": "c418a6ea7abd77b",
- "outputs": [],
- "execution_count": 59
- },
- {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-08-19T22:45:56.962873Z",
- "start_time": "2025-08-19T22:45:55.259223Z"
- }
- },
- "cell_type": "code",
"source": [
"%%time\n",
- "df2 = (\n",
+ "df1 = (\n",
" df.with_columns(\n",
- " (pd.col(\"Nom d'exercice\").str.to_lowercase() + \" \" + pd.col(\"Prénom d'exercice\").str.to_lowercase()).alias(\"Nom_Prénom\")\n",
+ " (pd.col(\"Nom d'exercice\") + \" \" + pd.col(\"Prénom d'exercice\")).alias(\"Nom_Prénom\")\n",
" )\n",
" .with_columns(\n",
" pd.col(\"Nom_Prénom\").n_unique().over(\"Identifiant PP\").alias(\"Count\")\n",
@@ -216,9 +174,37 @@
" .sort([\"Identifiant PP\", \"index\"])\n",
" .select(\"index\", \"Identifiant PP\", \"Count\", \"Nom d'exercice\", \"Prénom d'exercice\")\n",
")\n",
- "df2\n"
- ],
+ "df1\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "c418a6ea7abd77b",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-08-19T22:45:55.090712Z",
+ "start_time": "2025-08-19T22:45:55.072647Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "df1.write_csv(f\"{folder}{output_file}-Names_Variations_Strict{output_extension}\",\n",
+ " separator='|',\n",
+ " quote_style=\"never\",\n",
+ " line_terminator='\\n')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
"id": "9d94b716364356c7",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-08-19T22:45:56.962873Z",
+ "start_time": "2025-08-19T22:45:55.259223Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -230,6 +216,16 @@
},
{
"data": {
+ "text/html": [
+ "\n",
+ "
shape: (5_426, 5)| index | Identifiant PP | Count | Nom d'exercice | Prénom d'exercice |
|---|
| u32 | str | u32 | str | str |
| 1352933 | "10000034180" | 2 | "DUWAT-GEORGES" | "GHISLAINE" |
| 1352934 | "10000034180" | 2 | "GEORGES" | "GHISLAINE" |
| 1353009 | "10000046051" | 2 | "STUDER" | "AGNES" |
| 1623173 | "10000046051" | 2 | "JURION" | "AGNES" |
| 270462 | "10000101518" | 2 | "BARREYRE" | "SANDRINE" |
| … | … | … | … | … |
| 1619731 | "10110987236" | 2 | "ROGIER" | "MATHILDE" |
| 808810 | "10111077417" | 2 | "DOUVIER" | "FRANCETTE" |
| 2161999 | "10111077417" | 2 | "D'ELLOY" | "FRANCETTE" |
| 538415 | "10111110721" | 2 | "ROCHEPEAU" | "Nadège" |
| 2162425 | "10111110721" | 2 | "BARREAU" | "Nadège" |
"
+ ],
"text/plain": [
"shape: (5_426, 5)\n",
"┌─────────┬────────────────┬───────┬────────────────┬───────────────────┐\n",
@@ -249,7 +245,68 @@
"│ 538415 ┆ 10111110721 ┆ 2 ┆ ROCHEPEAU ┆ Nadège │\n",
"│ 2162425 ┆ 10111110721 ┆ 2 ┆ BARREAU ┆ Nadège │\n",
"└─────────┴────────────────┴───────┴────────────────┴───────────────────┘"
- ],
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "df2 = (\n",
+ " df.with_columns(\n",
+ " (pd.col(\"Nom d'exercice\").str.to_lowercase() + \" \" + pd.col(\"Prénom d'exercice\").str.to_lowercase()).alias(\"Nom_Prénom\")\n",
+ " )\n",
+ " .with_columns(\n",
+ " pd.col(\"Nom_Prénom\").n_unique().over(\"Identifiant PP\").alias(\"Count\")\n",
+ " )\n",
+ " .filter(pd.col(\"Count\") > 1)\n",
+ " .sort([\"Identifiant PP\", \"index\"])\n",
+ " .select(\"index\", \"Identifiant PP\", \"Count\", \"Nom d'exercice\", \"Prénom d'exercice\")\n",
+ ")\n",
+ "df2\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "18aab4499103491a",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-08-19T22:45:57.433036Z",
+ "start_time": "2025-08-19T22:45:57.417970Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "df2.write_csv(f\"{folder}{output_file}-Names_Variations_Insensitive{output_extension}\",\n",
+ " separator='|',\n",
+ " quote_style=\"never\",\n",
+ " line_terminator='\\n')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "8e4e3e22f16fea1c",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-08-19T22:46:02.915526Z",
+ "start_time": "2025-08-19T22:45:57.710258Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: total: 8.59 s\n",
+ "Wall time: 5.19 s\n"
+ ]
+ },
+ {
+ "data": {
"text/html": [
"\n",
- "
shape: (5_426, 5)| index | Identifiant PP | Count | Nom d'exercice | Prénom d'exercice |
|---|
| u32 | str | u32 | str | str |
| 1352933 | "10000034180" | 2 | "DUWAT-GEORGES" | "GHISLAINE" |
| 1352934 | "10000034180" | 2 | "GEORGES" | "GHISLAINE" |
| 1353009 | "10000046051" | 2 | "STUDER" | "AGNES" |
| 1623173 | "10000046051" | 2 | "JURION" | "AGNES" |
| 270462 | "10000101518" | 2 | "BARREYRE" | "SANDRINE" |
| … | … | … | … | … |
| 1619731 | "10110987236" | 2 | "ROGIER" | "MATHILDE" |
| 808810 | "10111077417" | 2 | "DOUVIER" | "FRANCETTE" |
| 2161999 | "10111077417" | 2 | "D'ELLOY" | "FRANCETTE" |
| 538415 | "10111110721" | 2 | "ROCHEPEAU" | "Nadège" |
| 2162425 | "10111110721" | 2 | "BARREAU" | "Nadège" |
"
+ "shape: (3_584, 6)| index | Identifiant PP | Count | Nom d'exercice | Prénom d'exercice | Nom_Prénom_Nettoyé |
|---|
| u32 | str | u32 | str | str | str |
| 1352933 | "10000034180" | 2 | "DUWAT-GEORGES" | "GHISLAINE" | "duwat georges ghislaine" |
| 1352934 | "10000034180" | 2 | "GEORGES" | "GHISLAINE" | "georges ghislaine" |
| 1353009 | "10000046051" | 2 | "STUDER" | "AGNES" | "studer agnes" |
| 1623173 | "10000046051" | 2 | "JURION" | "AGNES" | "jurion agnes" |
| 270462 | "10000101518" | 2 | "BARREYRE" | "SANDRINE" | "barreyre sandrine" |
| … | … | … | … | … | … |
| 1619731 | "10110987236" | 2 | "ROGIER" | "MATHILDE" | "rogier mathilde" |
| 808810 | "10111077417" | 2 | "DOUVIER" | "FRANCETTE" | "douvier francette" |
| 2161999 | "10111077417" | 2 | "D'ELLOY" | "FRANCETTE" | "d elloy francette" |
| 538415 | "10111110721" | 2 | "ROCHEPEAU" | "Nadège" | "rochepeau nadege" |
| 2162425 | "10111110721" | 2 | "BARREAU" | "Nadège" | "barreau nadege" |
"
+ ],
+ "text/plain": [
+ "shape: (3_584, 6)\n",
+ "┌─────────┬────────────────┬───────┬────────────────┬───────────────────┬─────────────────────────┐\n",
+ "│ index ┆ Identifiant PP ┆ Count ┆ Nom d'exercice ┆ Prénom d'exercice ┆ Nom_Prénom_Nettoyé │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n",
+ "╞═════════╪════════════════╪═══════╪════════════════╪═══════════════════╪═════════════════════════╡\n",
+ "│ 1352933 ┆ 10000034180 ┆ 2 ┆ DUWAT-GEORGES ┆ GHISLAINE ┆ duwat georges ghislaine │\n",
+ "│ 1352934 ┆ 10000034180 ┆ 2 ┆ GEORGES ┆ GHISLAINE ┆ georges ghislaine │\n",
+ "│ 1353009 ┆ 10000046051 ┆ 2 ┆ STUDER ┆ AGNES ┆ studer agnes │\n",
+ "│ 1623173 ┆ 10000046051 ┆ 2 ┆ JURION ┆ AGNES ┆ jurion agnes │\n",
+ "│ 270462 ┆ 10000101518 ┆ 2 ┆ BARREYRE ┆ SANDRINE ┆ barreyre sandrine │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ 1619731 ┆ 10110987236 ┆ 2 ┆ ROGIER ┆ MATHILDE ┆ rogier mathilde │\n",
+ "│ 808810 ┆ 10111077417 ┆ 2 ┆ DOUVIER ┆ FRANCETTE ┆ douvier francette │\n",
+ "│ 2161999 ┆ 10111077417 ┆ 2 ┆ D'ELLOY ┆ FRANCETTE ┆ d elloy francette │\n",
+ "│ 538415 ┆ 10111110721 ┆ 2 ┆ ROCHEPEAU ┆ Nadège ┆ rochepeau nadege │\n",
+ "│ 2162425 ┆ 10111110721 ┆ 2 ┆ BARREAU ┆ Nadège ┆ barreau nadege │\n",
+ "└─────────┴────────────────┴───────┴────────────────┴───────────────────┴─────────────────────────┘"
]
},
- "execution_count": 60,
+ "execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
- "execution_count": 60
- },
- {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-08-19T22:45:57.433036Z",
- "start_time": "2025-08-19T22:45:57.417970Z"
- }
- },
- "cell_type": "code",
- "source": [
- "df2.write_csv(f\"{folder}{output_file}-Names_Variations_Insensitive{output_extension}\",\n",
- " separator='|',\n",
- " quote_style=\"never\",\n",
- " line_terminator='\\n')\n"
- ],
- "id": "18aab4499103491a",
- "outputs": [],
- "execution_count": 61
- },
- {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-08-19T22:46:02.915526Z",
- "start_time": "2025-08-19T22:45:57.710258Z"
- }
- },
- "cell_type": "code",
"source": [
"%%time\n",
"df3 = (\n",
@@ -325,121 +374,36 @@
" .select(\"index\", \"Identifiant PP\", \"Count\", \"Nom d'exercice\", \"Prénom d'exercice\", \"Nom_Prénom_Nettoyé\")\n",
")\n",
"df3\n"
- ],
- "id": "8e4e3e22f16fea1c",
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: total: 8.59 s\n",
- "Wall time: 5.19 s\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "shape: (3_584, 6)\n",
- "┌─────────┬────────────────┬───────┬────────────────┬───────────────────┬─────────────────────────┐\n",
- "│ index ┆ Identifiant PP ┆ Count ┆ Nom d'exercice ┆ Prénom d'exercice ┆ Nom_Prénom_Nettoyé │\n",
- "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
- "│ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n",
- "╞═════════╪════════════════╪═══════╪════════════════╪═══════════════════╪═════════════════════════╡\n",
- "│ 1352933 ┆ 10000034180 ┆ 2 ┆ DUWAT-GEORGES ┆ GHISLAINE ┆ duwat georges ghislaine │\n",
- "│ 1352934 ┆ 10000034180 ┆ 2 ┆ GEORGES ┆ GHISLAINE ┆ georges ghislaine │\n",
- "│ 1353009 ┆ 10000046051 ┆ 2 ┆ STUDER ┆ AGNES ┆ studer agnes │\n",
- "│ 1623173 ┆ 10000046051 ┆ 2 ┆ JURION ┆ AGNES ┆ jurion agnes │\n",
- "│ 270462 ┆ 10000101518 ┆ 2 ┆ BARREYRE ┆ SANDRINE ┆ barreyre sandrine │\n",
- "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
- "│ 1619731 ┆ 10110987236 ┆ 2 ┆ ROGIER ┆ MATHILDE ┆ rogier mathilde │\n",
- "│ 808810 ┆ 10111077417 ┆ 2 ┆ DOUVIER ┆ FRANCETTE ┆ douvier francette │\n",
- "│ 2161999 ┆ 10111077417 ┆ 2 ┆ D'ELLOY ┆ FRANCETTE ┆ d elloy francette │\n",
- "│ 538415 ┆ 10111110721 ┆ 2 ┆ ROCHEPEAU ┆ Nadège ┆ rochepeau nadege │\n",
- "│ 2162425 ┆ 10111110721 ┆ 2 ┆ BARREAU ┆ Nadège ┆ barreau nadege │\n",
- "└─────────┴────────────────┴───────┴────────────────┴───────────────────┴─────────────────────────┘"
- ],
- "text/html": [
- "\n",
- "
shape: (3_584, 6)| index | Identifiant PP | Count | Nom d'exercice | Prénom d'exercice | Nom_Prénom_Nettoyé |
|---|
| u32 | str | u32 | str | str | str |
| 1352933 | "10000034180" | 2 | "DUWAT-GEORGES" | "GHISLAINE" | "duwat georges ghislaine" |
| 1352934 | "10000034180" | 2 | "GEORGES" | "GHISLAINE" | "georges ghislaine" |
| 1353009 | "10000046051" | 2 | "STUDER" | "AGNES" | "studer agnes" |
| 1623173 | "10000046051" | 2 | "JURION" | "AGNES" | "jurion agnes" |
| 270462 | "10000101518" | 2 | "BARREYRE" | "SANDRINE" | "barreyre sandrine" |
| … | … | … | … | … | … |
| 1619731 | "10110987236" | 2 | "ROGIER" | "MATHILDE" | "rogier mathilde" |
| 808810 | "10111077417" | 2 | "DOUVIER" | "FRANCETTE" | "douvier francette" |
| 2161999 | "10111077417" | 2 | "D'ELLOY" | "FRANCETTE" | "d elloy francette" |
| 538415 | "10111110721" | 2 | "ROCHEPEAU" | "Nadège" | "rochepeau nadege" |
| 2162425 | "10111110721" | 2 | "BARREAU" | "Nadège" | "barreau nadege" |
"
- ]
- },
- "execution_count": 62,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "execution_count": 62
+ ]
},
{
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "aab2ae2e91a7190c",
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T22:46:03.290835Z",
"start_time": "2025-08-19T22:46:03.280259Z"
}
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"df3.write_csv(f\"{folder}{output_file}-Names_Variations_Normalized{output_extension}\",\n",
" separator='|',\n",
" quote_style=\"never\",\n",
" line_terminator='\\n')\n"
- ],
- "id": "aab2ae2e91a7190c",
- "outputs": [],
- "execution_count": 63
+ ]
},
{
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "3c2f2bb5fc3c2a5e",
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T22:46:07.814563Z",
"start_time": "2025-08-19T22:46:03.493442Z"
}
},
- "cell_type": "code",
- "source": [
- "%%time\n",
- "all_columns = df.columns\n",
- "start_col = 'Raison sociale site'\n",
- "end_col = \"Libellé secteur d'activité\"\n",
- "start_col_index = all_columns.index(start_col)\n",
- "end_col_index = all_columns.index(end_col)\n",
- "site_info_cols = all_columns[start_col_index : end_col_index + 1]\n",
- "if \"Autorité d'enregistrement\" in site_info_cols:\n",
- " site_info_cols.remove(\"Autorité d'enregistrement\")\n",
- "\n",
- "df4 = (\n",
- " df\n",
- " .filter(pd.col('Numéro FINESS site').is_null())\n",
- " .filter(\n",
- " pd.col('Numéro SIRET site').is_not_null() | pd.col('Identifiant technique de la structure').is_not_null()\n",
- " )\n",
- " .with_columns(\n",
- " pd.coalesce(\n",
- " pd.col('Numéro SIRET site'),\n",
- " pd.col('Identifiant technique de la structure')\n",
- " ).alias('Site_Identifier')\n",
- " )\n",
- " .with_columns(\n",
- " pd.struct(site_info_cols).n_unique().over(['Identifiant PP', 'Site_Identifier']).alias('Site_Info_Variations_Count')\n",
- " )\n",
- " .filter(pd.col('Site_Info_Variations_Count') > 1)\n",
- " .sort(['Identifiant PP', 'Site_Identifier', 'index'])\n",
- " .select(['index', \n",
- " 'Identifiant PP', \n",
- " 'Site_Identifier', \n",
- " 'Numéro SIRET site', \n",
- " 'Identifiant technique de la structure', \n",
- " 'Site_Info_Variations_Count'] + site_info_cols)\n",
- ")\n",
- "df4"
- ],
- "id": "3c2f2bb5fc3c2a5e",
"outputs": [
{
"name": "stdout",
@@ -451,6 +415,16 @@
},
{
"data": {
+ "text/html": [
+ "\n",
+ "
shape: (98, 31)| index | Identifiant PP | Site_Identifier | Numéro SIRET site | Identifiant technique de la structure | Site_Info_Variations_Count | Raison sociale site | Enseigne commerciale site | Complément destinataire (coord. structure) | Complément point géographique (coord. structure) | Numéro Voie (coord. structure) | Indice répétition voie (coord. structure) | Code type de voie (coord. structure) | Libellé type de voie (coord. structure) | Libellé Voie (coord. structure) | Mention distribution (coord. structure) | Bureau cedex (coord. structure) | Code postal (coord. structure) | Code commune (coord. structure) | Libellé commune (coord. structure) | Code pays (coord. structure) | Libellé pays (coord. structure) | Téléphone (coord. structure) | Téléphone 2 (coord. structure) | Télécopie (coord. structure) | Adresse e-mail (coord. structure) | Code Département (structure) | Libellé Département (structure) | Ancien identifiant de la structure | Code secteur d'activité | Libellé secteur d'activité |
|---|
| u32 | str | str | str | str | u32 | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str |
| 270597 | "10000116383" | "30980220500505" | "30980220500505" | "R10100000198782" | 2 | "OEUVRES HOSPITALIERES FRANCAIS… | "ORDRE DE MALTE FRANCE" | "ORDRE DE MALTE FRANCE" | null | "42" | null | null | null | "RUE DES VOLONTAIRES" | null | "75015 PARIS" | "75015" | "75056" | "Paris" | "99000" | "France" | null | null | null | null | null | null | "330980220500505" | "SA28" | "Asso et orga humanitaire" |
| 1353470 | "10000116383" | "30980220500505" | "30980220500505" | "R10100000779807" | 2 | "OEUVRE HOSP FRANC DE L'ORDRE D… | null | null | null | "49" | null | "R" | "Rue" | "DE LA CHAPELLE" | null | "75018 PARIS 18E ARRONDISSEMEN… | "75018" | "75118" | "Paris 18e Arrondissement" | null | null | null | null | null | null | null | null | "330980220500505" | "SA28" | "Asso et orga humanitaire" |
| 4214 | "10000536309" | "18003502402369" | "18003502402369" | "R10100000050224" | 2 | "DRSM PAYS DE LA LOIREELSM 44" | "SITE NANTES" | null | null | "9" | null | "R" | "Rue" | "GAETAN RONDEAU" | "BP" | "44203 NANTES" | "44203" | "44109" | "Nantes" | null | null | null | null | null | null | null | null | "318003502402369" | "SA24" | "Organisme de Sécurité Sociale" |
| 1627221 | "10000536309" | "18003502402369" | "18003502402369" | "R10100000049799" | 2 | "DRSM PAYS DE LOIREELSM 49" | "SITE CHOLET" | null | null | "2" | null | "R" | "Rue" | "SAINT ELOI" | "BP" | "49321 CHOLET" | "49321" | "49099" | "Cholet" | null | null | null | null | null | null | null | null | "318003502402369" | "SA24" | "Organisme de Sécurité Sociale" |
| 816501 | "10000667187" | "18003502402369" | "18003502402369" | "R10100000049794" | 3 | "DRSM NORD PICARDIEELSM 59" | "SITE MAUBEUGE" | null | null | null | null | "PL" | "Place" | "DE WATTIGNIES" | "BP" | "59603 MAUBEUGE" | "59603" | "59392" | "Maubeuge" | null | null | null | null | null | null | null | null | "318003502402369" | "SA24" | "Organisme de Sécurité Sociale" |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 532248 | "10110592168" | "88085935000014" | "88085935000014" | "R10100000325887" | 2 | "FAREVA PAU" | null | null | null | null | null | "AV" | "Avenue" | "DU BEARN" | null | "64320 IDRON" | "64320" | "64269" | "Idron" | null | null | "0559402100" | null | "0559402119" | null | null | null | "388085935000014" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| 1344743 | "10110597498" | "13000800600038" | "13000800600038" | "R10100000097229" | 2 | "AGENCE REGIONALE SANTE PAYS LO… | null | null | null | "17" | null | "BD" | "Boulevard" | "GASTON DOUMERGUE" | null | "44262 NANTES" | "44262" | "44109" | "Nantes" | null | null | null | null | null | null | null | null | "313000800600038" | "SA24" | "Organisme de Sécurité Sociale" |
| 2156205 | "10110597498" | "13000800600038" | "13000800600038" | "R10100000097229" | 2 | "AGENCE REGIONALE SANTE PAYS LO… | null | "DELEGATION TERRITORIALE" | null | "2" | null | "BD" | "Boulevard" | "MURAT" | null | "53000 LAVAL" | "53000" | "53130" | "Laval" | null | null | null | null | null | null | null | null | "313000800600038" | "SA24" | "Organisme de Sécurité Sociale" |
| 1618789 | "10110910345" | "18003502401098" | "18003502401098" | "R10100000398898" | 2 | "CAISSE NATIONALE DE L'ASSURANC… | "DRSM DIRECTION REG. DU SERVICE… | "QUARTIER DU LAC" | null | "80" | null | null | null | "AVENUE DE LA JALLERE" | "BP 260" | "33300 BORDEAUX" | "33300" | "33063" | "Bordeaux" | "99000" | "France" | null | null | null | null | null | null | "318003502401098" | "SA24" | "Organisme de Sécurité Sociale" |
| 1889827 | "10110910345" | "18003502401098" | "18003502401098" | "R10100000398898" | 2 | "CAISSE NATIONALE DE L'ASSURANC… | "DRSM DIRECTION REG. DU SERVICE… | null | null | "207" | null | "R" | "Rue" | "FONTAINEBLEAU" | "BP" | "40011 MONT-DE-MARSAN" | "40011" | "40192" | "Mont-de-Marsan" | null | null | null | null | null | null | null | null | "318003502401098" | "SA24" | "Organisme de Sécurité Sociale" |
"
+ ],
"text/plain": [
"shape: (98, 31)\n",
"┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n",
@@ -501,16 +475,6 @@
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n",
"└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘"
- ],
- "text/html": [
- "\n",
- "
shape: (98, 31)| index | Identifiant PP | Site_Identifier | Numéro SIRET site | Identifiant technique de la structure | Site_Info_Variations_Count | Raison sociale site | Enseigne commerciale site | Complément destinataire (coord. structure) | Complément point géographique (coord. structure) | Numéro Voie (coord. structure) | Indice répétition voie (coord. structure) | Code type de voie (coord. structure) | Libellé type de voie (coord. structure) | Libellé Voie (coord. structure) | Mention distribution (coord. structure) | Bureau cedex (coord. structure) | Code postal (coord. structure) | Code commune (coord. structure) | Libellé commune (coord. structure) | Code pays (coord. structure) | Libellé pays (coord. structure) | Téléphone (coord. structure) | Téléphone 2 (coord. structure) | Télécopie (coord. structure) | Adresse e-mail (coord. structure) | Code Département (structure) | Libellé Département (structure) | Ancien identifiant de la structure | Code secteur d'activité | Libellé secteur d'activité |
|---|
| u32 | str | str | str | str | u32 | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str |
| 270597 | "10000116383" | "30980220500505" | "30980220500505" | "R10100000198782" | 2 | "OEUVRES HOSPITALIERES FRANCAIS… | "ORDRE DE MALTE FRANCE" | "ORDRE DE MALTE FRANCE" | null | "42" | null | null | null | "RUE DES VOLONTAIRES" | null | "75015 PARIS" | "75015" | "75056" | "Paris" | "99000" | "France" | null | null | null | null | null | null | "330980220500505" | "SA28" | "Asso et orga humanitaire" |
| 1353470 | "10000116383" | "30980220500505" | "30980220500505" | "R10100000779807" | 2 | "OEUVRE HOSP FRANC DE L'ORDRE D… | null | null | null | "49" | null | "R" | "Rue" | "DE LA CHAPELLE" | null | "75018 PARIS 18E ARRONDISSEMEN… | "75018" | "75118" | "Paris 18e Arrondissement" | null | null | null | null | null | null | null | null | "330980220500505" | "SA28" | "Asso et orga humanitaire" |
| 4214 | "10000536309" | "18003502402369" | "18003502402369" | "R10100000050224" | 2 | "DRSM PAYS DE LA LOIREELSM 44" | "SITE NANTES" | null | null | "9" | null | "R" | "Rue" | "GAETAN RONDEAU" | "BP" | "44203 NANTES" | "44203" | "44109" | "Nantes" | null | null | null | null | null | null | null | null | "318003502402369" | "SA24" | "Organisme de Sécurité Sociale" |
| 1627221 | "10000536309" | "18003502402369" | "18003502402369" | "R10100000049799" | 2 | "DRSM PAYS DE LOIREELSM 49" | "SITE CHOLET" | null | null | "2" | null | "R" | "Rue" | "SAINT ELOI" | "BP" | "49321 CHOLET" | "49321" | "49099" | "Cholet" | null | null | null | null | null | null | null | null | "318003502402369" | "SA24" | "Organisme de Sécurité Sociale" |
| 816501 | "10000667187" | "18003502402369" | "18003502402369" | "R10100000049794" | 3 | "DRSM NORD PICARDIEELSM 59" | "SITE MAUBEUGE" | null | null | null | null | "PL" | "Place" | "DE WATTIGNIES" | "BP" | "59603 MAUBEUGE" | "59603" | "59392" | "Maubeuge" | null | null | null | null | null | null | null | null | "318003502402369" | "SA24" | "Organisme de Sécurité Sociale" |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 532248 | "10110592168" | "88085935000014" | "88085935000014" | "R10100000325887" | 2 | "FAREVA PAU" | null | null | null | null | null | "AV" | "Avenue" | "DU BEARN" | null | "64320 IDRON" | "64320" | "64269" | "Idron" | null | null | "0559402100" | null | "0559402119" | null | null | null | "388085935000014" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| 1344743 | "10110597498" | "13000800600038" | "13000800600038" | "R10100000097229" | 2 | "AGENCE REGIONALE SANTE PAYS LO… | null | null | null | "17" | null | "BD" | "Boulevard" | "GASTON DOUMERGUE" | null | "44262 NANTES" | "44262" | "44109" | "Nantes" | null | null | null | null | null | null | null | null | "313000800600038" | "SA24" | "Organisme de Sécurité Sociale" |
| 2156205 | "10110597498" | "13000800600038" | "13000800600038" | "R10100000097229" | 2 | "AGENCE REGIONALE SANTE PAYS LO… | null | "DELEGATION TERRITORIALE" | null | "2" | null | "BD" | "Boulevard" | "MURAT" | null | "53000 LAVAL" | "53000" | "53130" | "Laval" | null | null | null | null | null | null | null | null | "313000800600038" | "SA24" | "Organisme de Sécurité Sociale" |
| 1618789 | "10110910345" | "18003502401098" | "18003502401098" | "R10100000398898" | 2 | "CAISSE NATIONALE DE L'ASSURANC… | "DRSM DIRECTION REG. DU SERVICE… | "QUARTIER DU LAC" | null | "80" | null | null | null | "AVENUE DE LA JALLERE" | "BP 260" | "33300 BORDEAUX" | "33300" | "33063" | "Bordeaux" | "99000" | "France" | null | null | null | null | null | null | "318003502401098" | "SA24" | "Organisme de Sécurité Sociale" |
| 1889827 | "10110910345" | "18003502401098" | "18003502401098" | "R10100000398898" | 2 | "CAISSE NATIONALE DE L'ASSURANC… | "DRSM DIRECTION REG. DU SERVICE… | null | null | "207" | null | "R" | "Rue" | "FONTAINEBLEAU" | "BP" | "40011 MONT-DE-MARSAN" | "40011" | "40192" | "Mont-de-Marsan" | null | null | null | null | null | null | null | null | "318003502401098" | "SA24" | "Organisme de Sécurité Sociale" |
"
]
},
"execution_count": 64,
@@ -518,37 +482,18 @@
"output_type": "execute_result"
}
],
- "execution_count": 64
- },
- {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-08-19T22:46:07.974271Z",
- "start_time": "2025-08-19T22:46:07.943280Z"
- }
- },
- "cell_type": "code",
- "source": [
- "df4.write_csv(f\"{folder}{output_file}-Sites_Variations{output_extension}\",\n",
- " separator='|',\n",
- " quote_style=\"never\",\n",
- " line_terminator='\\n')\n"
- ],
- "id": "c1fd01e419f4ccc9",
- "outputs": [],
- "execution_count": 65
- },
- {
- "metadata": {
- "ExecuteTime": {
- "end_time": "2025-08-19T22:46:12.781888Z",
- "start_time": "2025-08-19T22:46:08.306776Z"
- }
- },
- "cell_type": "code",
"source": [
"%%time\n",
- "df5 = (\n",
+ "all_columns = df.columns\n",
+ "start_col = 'Raison sociale site'\n",
+ "end_col = \"Libellé secteur d'activité\"\n",
+ "start_col_index = all_columns.index(start_col)\n",
+ "end_col_index = all_columns.index(end_col)\n",
+ "site_info_cols = all_columns[start_col_index : end_col_index + 1]\n",
+ "if \"Autorité d'enregistrement\" in site_info_cols:\n",
+ " site_info_cols.remove(\"Autorité d'enregistrement\")\n",
+ "\n",
+ "df4 = (\n",
" df\n",
" .filter(pd.col('Numéro FINESS site').is_null())\n",
" .filter(\n",
@@ -561,10 +506,10 @@
" ).alias('Site_Identifier')\n",
" )\n",
" .with_columns(\n",
- " pd.struct(site_info_cols).n_unique().over(['Site_Identifier']).alias('Site_Info_Variations_Count')\n",
+ " pd.struct(site_info_cols).n_unique().over(['Identifiant PP', 'Site_Identifier']).alias('Site_Info_Variations_Count')\n",
" )\n",
" .filter(pd.col('Site_Info_Variations_Count') > 1)\n",
- " .sort(['Site_Identifier', 'index'])\n",
+ " .sort(['Identifiant PP', 'Site_Identifier', 'index'])\n",
" .select(['index', \n",
" 'Identifiant PP', \n",
" 'Site_Identifier', \n",
@@ -572,9 +517,37 @@
" 'Identifiant technique de la structure', \n",
" 'Site_Info_Variations_Count'] + site_info_cols)\n",
")\n",
- "df5\n"
- ],
+ "df4"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "c1fd01e419f4ccc9",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-08-19T22:46:07.974271Z",
+ "start_time": "2025-08-19T22:46:07.943280Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "df4.write_csv(f\"{folder}{output_file}-Sites_Variations{output_extension}\",\n",
+ " separator='|',\n",
+ " quote_style=\"never\",\n",
+ " line_terminator='\\n')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
"id": "7838523925fc85ee",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-08-19T22:46:12.781888Z",
+ "start_time": "2025-08-19T22:46:08.306776Z"
+ }
+ },
"outputs": [
{
"name": "stdout",
@@ -586,6 +559,16 @@
},
{
"data": {
+ "text/html": [
+ "\n",
+ "
shape: (4_190, 31)| index | Identifiant PP | Site_Identifier | Numéro SIRET site | Identifiant technique de la structure | Site_Info_Variations_Count | Raison sociale site | Enseigne commerciale site | Complément destinataire (coord. structure) | Complément point géographique (coord. structure) | Numéro Voie (coord. structure) | Indice répétition voie (coord. structure) | Code type de voie (coord. structure) | Libellé type de voie (coord. structure) | Libellé Voie (coord. structure) | Mention distribution (coord. structure) | Bureau cedex (coord. structure) | Code postal (coord. structure) | Code commune (coord. structure) | Libellé commune (coord. structure) | Code pays (coord. structure) | Libellé pays (coord. structure) | Téléphone (coord. structure) | Téléphone 2 (coord. structure) | Télécopie (coord. structure) | Adresse e-mail (coord. structure) | Code Département (structure) | Libellé Département (structure) | Ancien identifiant de la structure | Code secteur d'activité | Libellé secteur d'activité |
|---|
| u32 | str | str | str | str | u32 | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str |
| 127508 | "10100002293" | "05650171100115" | "05650171100115" | "R10000001502146" | 2 | "BECTON DICKINSON FRANCE" | null | null | null | "11" | null | "R" | "Rue" | "RUE ARISTIDE BERGES" | null | "38801 LE PONT DE CLAIX CEDEX" | "38801" | "38317" | "Le Pont-de-Claix" | "99000" | "France" | null | null | null | null | null | null | "305650171100115" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| 285182 | "10001806768" | "05650171100115" | "05650171100115" | "R10000001502146" | 2 | "BECTON DICKINSON FRANCE" | null | null | null | "11" | null | "R" | "Rue" | "RUE ARISTIDE BERGES" | null | "38801 LE PONT DE CLAIX CEDEX" | "38801" | "38317" | "Le Pont-de-Claix" | "99000" | "France" | null | null | null | null | null | null | "305650171100115" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| 466100 | "10104800411" | "05650171100115" | "05650171100115" | "R10000001502146" | 2 | "BECTON DICKINSON FRANCE" | null | null | null | "11" | null | "R" | "Rue" | "RUE ARISTIDE BERGES" | null | "38801 LE PONT DE CLAIX CEDEX" | "38801" | "38317" | "Le Pont-de-Claix" | "99000" | "France" | null | null | null | null | null | null | "305650171100115" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| 722626 | "10103687157" | "05650171100115" | "05650171100115" | "R10000001502146" | 2 | "BECTON DICKINSON FRANCE" | null | null | null | "11" | null | "R" | "Rue" | "RUE ARISTIDE BERGES" | null | "38801 LE PONT DE CLAIX CEDEX" | "38801" | "38317" | "Le Pont-de-Claix" | "99000" | "France" | null | null | null | null | null | null | "305650171100115" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| 826390 | "10001796597" | "05650171100115" | "05650171100115" | "R10000001502146" | 2 | "BECTON DICKINSON FRANCE" | null | null | null | "11" | null | "R" | "Rue" | "RUE ARISTIDE BERGES" | null | "38801 LE PONT DE CLAIX CEDEX" | "38801" | "38317" | "Le Pont-de-Claix" | "99000" | "France" | null | null | null | null | null | null | "305650171100115" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 793647 | "10109869403" | "98452619400019" | "98452619400019" | "R10100000673943" | 2 | "SELARL CENTRE DE PODOLOGIE SPO… | "SELARL CENTRE DE PODOLOGIE SPO… | null | null | "8" | null | "R" | "Rue" | "GEORGES NEGREVERGNE" | null | "33700 MERIGNAC" | "33700" | "33281" | "Mérignac" | "99000" | "France" | null | null | null | null | null | null | "398452619400019" | "SA09" | "Exercice en Société" |
| 1500531 | "10101293263" | "98452619400019" | "98452619400019" | "R10100000673943" | 2 | "SELARL CENTRE DE PODOLOGIE SPO… | "SELARL CENTRE DE PODOLOGIE SPO… | null | null | "8" | null | "R" | "Rue" | "GEORGES NEGREVERGNE" | null | "33700 MERIGNAC" | "33700" | "33281" | "Mérignac" | "99000" | "France" | null | null | null | null | null | null | "398452619400019" | "SA09" | "Exercice en Société" |
| 1852947 | "10108015131" | "98452619400019" | "98452619400019" | "R10100000673943" | 2 | "SELARL CENTRE DE PODOLOGIE SPO… | "SELARL CENTRE DE PODOLOGIE SPO… | null | null | "8" | null | "R" | "Rue" | "GEORGES NEGREVERGNE" | null | "33700 MERIGNAC" | "33700" | "33281" | "Mérignac" | "99000" | "France" | null | null | null | null | null | null | "398452619400019" | "SA09" | "Exercice en Société" |
| 1571945 | "10107235243" | "99882350430834" | "99882350430834" | "R10100000554688" | 2 | "ADECCO FRANCE" | "ADECCO" | null | "PARC VALMY PARK AVENUE BAT A 1… | "8" | "D" | null | null | "RUE JEANNE BARRET" | null | "21000 DIJON" | "21000" | "21231" | "Dijon" | "99000" | "France" | null | null | null | null | null | null | "399882350430834" | "SA11" | "Entreprise d'intérim" |
| 1590681 | "10108693036" | "99882350430834" | "99882350430834" | "R10100000413248" | 2 | "ADECCO MEDICAL" | null | null | null | "8" | "D" | "R" | "Rue" | "JEANNE BARRET" | null | "21000 DIJON" | "21000" | "21231" | "Dijon" | "99000" | "France" | null | null | null | null | null | null | "399882350430834" | "SA11" | "Entreprise d'intérim" |
"
+ ],
"text/plain": [
"shape: (4_190, 31)\n",
"┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n",
@@ -633,16 +616,6 @@
"│ ┆ 6 ┆ 30834 ┆ 30834 ┆ ┆ ┆ 430834 ┆ ┆ e │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ d'intérim │\n",
"└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘"
- ],
- "text/html": [
- "\n",
- "
shape: (4_190, 31)| index | Identifiant PP | Site_Identifier | Numéro SIRET site | Identifiant technique de la structure | Site_Info_Variations_Count | Raison sociale site | Enseigne commerciale site | Complément destinataire (coord. structure) | Complément point géographique (coord. structure) | Numéro Voie (coord. structure) | Indice répétition voie (coord. structure) | Code type de voie (coord. structure) | Libellé type de voie (coord. structure) | Libellé Voie (coord. structure) | Mention distribution (coord. structure) | Bureau cedex (coord. structure) | Code postal (coord. structure) | Code commune (coord. structure) | Libellé commune (coord. structure) | Code pays (coord. structure) | Libellé pays (coord. structure) | Téléphone (coord. structure) | Téléphone 2 (coord. structure) | Télécopie (coord. structure) | Adresse e-mail (coord. structure) | Code Département (structure) | Libellé Département (structure) | Ancien identifiant de la structure | Code secteur d'activité | Libellé secteur d'activité |
|---|
| u32 | str | str | str | str | u32 | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str |
| 127508 | "10100002293" | "05650171100115" | "05650171100115" | "R10000001502146" | 2 | "BECTON DICKINSON FRANCE" | null | null | null | "11" | null | "R" | "Rue" | "RUE ARISTIDE BERGES" | null | "38801 LE PONT DE CLAIX CEDEX" | "38801" | "38317" | "Le Pont-de-Claix" | "99000" | "France" | null | null | null | null | null | null | "305650171100115" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| 285182 | "10001806768" | "05650171100115" | "05650171100115" | "R10000001502146" | 2 | "BECTON DICKINSON FRANCE" | null | null | null | "11" | null | "R" | "Rue" | "RUE ARISTIDE BERGES" | null | "38801 LE PONT DE CLAIX CEDEX" | "38801" | "38317" | "Le Pont-de-Claix" | "99000" | "France" | null | null | null | null | null | null | "305650171100115" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| 466100 | "10104800411" | "05650171100115" | "05650171100115" | "R10000001502146" | 2 | "BECTON DICKINSON FRANCE" | null | null | null | "11" | null | "R" | "Rue" | "RUE ARISTIDE BERGES" | null | "38801 LE PONT DE CLAIX CEDEX" | "38801" | "38317" | "Le Pont-de-Claix" | "99000" | "France" | null | null | null | null | null | null | "305650171100115" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| 722626 | "10103687157" | "05650171100115" | "05650171100115" | "R10000001502146" | 2 | "BECTON DICKINSON FRANCE" | null | null | null | "11" | null | "R" | "Rue" | "RUE ARISTIDE BERGES" | null | "38801 LE PONT DE CLAIX CEDEX" | "38801" | "38317" | "Le Pont-de-Claix" | "99000" | "France" | null | null | null | null | null | null | "305650171100115" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| 826390 | "10001796597" | "05650171100115" | "05650171100115" | "R10000001502146" | 2 | "BECTON DICKINSON FRANCE" | null | null | null | "11" | null | "R" | "Rue" | "RUE ARISTIDE BERGES" | null | "38801 LE PONT DE CLAIX CEDEX" | "38801" | "38317" | "Le Pont-de-Claix" | "99000" | "France" | null | null | null | null | null | null | "305650171100115" | "SA32" | "Fab. Exploit. Import. Méd. DM" |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| 793647 | "10109869403" | "98452619400019" | "98452619400019" | "R10100000673943" | 2 | "SELARL CENTRE DE PODOLOGIE SPO… | "SELARL CENTRE DE PODOLOGIE SPO… | null | null | "8" | null | "R" | "Rue" | "GEORGES NEGREVERGNE" | null | "33700 MERIGNAC" | "33700" | "33281" | "Mérignac" | "99000" | "France" | null | null | null | null | null | null | "398452619400019" | "SA09" | "Exercice en Société" |
| 1500531 | "10101293263" | "98452619400019" | "98452619400019" | "R10100000673943" | 2 | "SELARL CENTRE DE PODOLOGIE SPO… | "SELARL CENTRE DE PODOLOGIE SPO… | null | null | "8" | null | "R" | "Rue" | "GEORGES NEGREVERGNE" | null | "33700 MERIGNAC" | "33700" | "33281" | "Mérignac" | "99000" | "France" | null | null | null | null | null | null | "398452619400019" | "SA09" | "Exercice en Société" |
| 1852947 | "10108015131" | "98452619400019" | "98452619400019" | "R10100000673943" | 2 | "SELARL CENTRE DE PODOLOGIE SPO… | "SELARL CENTRE DE PODOLOGIE SPO… | null | null | "8" | null | "R" | "Rue" | "GEORGES NEGREVERGNE" | null | "33700 MERIGNAC" | "33700" | "33281" | "Mérignac" | "99000" | "France" | null | null | null | null | null | null | "398452619400019" | "SA09" | "Exercice en Société" |
| 1571945 | "10107235243" | "99882350430834" | "99882350430834" | "R10100000554688" | 2 | "ADECCO FRANCE" | "ADECCO" | null | "PARC VALMY PARK AVENUE BAT A 1… | "8" | "D" | null | null | "RUE JEANNE BARRET" | null | "21000 DIJON" | "21000" | "21231" | "Dijon" | "99000" | "France" | null | null | null | null | null | null | "399882350430834" | "SA11" | "Entreprise d'intérim" |
| 1590681 | "10108693036" | "99882350430834" | "99882350430834" | "R10100000413248" | 2 | "ADECCO MEDICAL" | null | null | null | "8" | "D" | "R" | "Rue" | "JEANNE BARRET" | null | "21000 DIJON" | "21000" | "21231" | "Dijon" | "99000" | "France" | null | null | null | null | null | null | "399882350430834" | "SA11" | "Entreprise d'intérim" |
"
]
},
"execution_count": 66,
@@ -650,75 +623,63 @@
"output_type": "execute_result"
}
],
- "execution_count": 66
+ "source": [
+ "%%time\n",
+ "df5 = (\n",
+ " df\n",
+ " .filter(pd.col('Numéro FINESS site').is_null())\n",
+ " .filter(\n",
+ " pd.col('Numéro SIRET site').is_not_null() | pd.col('Identifiant technique de la structure').is_not_null()\n",
+ " )\n",
+ " .with_columns(\n",
+ " pd.coalesce(\n",
+ " pd.col('Numéro SIRET site'),\n",
+ " pd.col('Identifiant technique de la structure')\n",
+ " ).alias('Site_Identifier')\n",
+ " )\n",
+ " .with_columns(\n",
+ " pd.struct(site_info_cols).n_unique().over(['Site_Identifier']).alias('Site_Info_Variations_Count')\n",
+ " )\n",
+ " .filter(pd.col('Site_Info_Variations_Count') > 1)\n",
+ " .sort(['Site_Identifier', 'index'])\n",
+ " .select(['index', \n",
+ " 'Identifiant PP', \n",
+ " 'Site_Identifier', \n",
+ " 'Numéro SIRET site', \n",
+ " 'Identifiant technique de la structure', \n",
+ " 'Site_Info_Variations_Count'] + site_info_cols)\n",
+ ")\n",
+ "df5\n"
+ ]
},
{
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "416184f32f973a71",
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T22:46:13.670911Z",
"start_time": "2025-08-19T22:46:13.655386Z"
}
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"df5.write_csv(f\"{folder}{output_file}-Sites_Variations_Global{output_extension}\",\n",
" separator='|',\n",
" quote_style=\"never\",\n",
" line_terminator='\\n')\n"
- ],
- "id": "416184f32f973a71",
- "outputs": [],
- "execution_count": 67
+ ]
},
{
+ "cell_type": "code",
+ "execution_count": 68,
+ "id": "84549f83ce5e92f",
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T22:46:17.023811Z",
"start_time": "2025-08-19T22:46:14.032470Z"
}
},
- "cell_type": "code",
- "source": [
- "%%time\n",
- "df6 = (\n",
- " df\n",
- " .with_columns(\n",
- " pd.coalesce(\n",
- " pd.col('Numéro FINESS site'),\n",
- " pd.col('Numéro SIRET site'),\n",
- " pd.col('Identifiant technique de la structure')\n",
- " ).alias('Site_Identifier_Global')\n",
- " )\n",
- " .filter(pd.col('Site_Identifier_Global').is_not_null())\n",
- " .with_columns(\n",
- " pd.struct([\n",
- " \"Libellé profession\",\n",
- " \"Libellé savoir-faire\",\n",
- " \"Libellé mode exercice\",\n",
- " \"Libellé rôle\",\n",
- " \"Libellé genre activité\"\n",
- " ]).n_unique().over([\"Identifiant PP\", \"Site_Identifier_Global\"]).alias(\"Activites_Count\")\n",
- " )\n",
- " .filter(pd.col(\"Activites_Count\") > 1)\n",
- " .sort([\"Identifiant PP\", \"Site_Identifier_Global\", \"index\"])\n",
- " .select([\n",
- " \"index\",\n",
- " \"Identifiant PP\",\n",
- " \"Site_Identifier_Global\",\n",
- " \"Numéro FINESS site\",\n",
- " \"Numéro SIRET site\",\n",
- " \"Identifiant technique de la structure\",\n",
- " \"Activites_Count\",\n",
- " \"Libellé profession\",\n",
- " \"Libellé savoir-faire\",\n",
- " \"Libellé mode exercice\",\n",
- " \"Libellé rôle\",\n",
- " \"Libellé genre activité\"\n",
- " ])\n",
- ")\n",
- "df6\n"
- ],
- "id": "84549f83ce5e92f",
"outputs": [
{
"name": "stdout",
@@ -730,6 +691,16 @@
},
{
"data": {
+ "text/html": [
+ "\n",
+ "
shape: (25_389, 12)| index | Identifiant PP | Site_Identifier_Global | Numéro FINESS site | Numéro SIRET site | Identifiant technique de la structure | Activites_Count | Libellé profession | Libellé savoir-faire | Libellé mode exercice | Libellé rôle | Libellé genre activité |
|---|
| u32 | str | str | str | str | str | u32 | str | str | str | str | str |
| 1352840 | "10000017979" | "130786445" | "130786445" | "30247736900011" | "F130786445" | 2 | "Ostéopathe" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 1893982 | "10000017979" | "130786445" | "130786445" | "30247736900011" | "F130786445" | 2 | "Sage-Femme" | null | "Salarié" | "Fonction non définie" | "Activité standard de soin ou d… |
| 270269 | "10000070283" | "39784090100011" | null | "39784090100011" | "R10000002500225" | 2 | "Chirurgien-Dentiste" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 811380 | "10000070283" | "39784090100011" | null | "39784090100011" | "R10000002500225" | 2 | "Chirurgien-Dentiste" | null | "Salarié" | "Fonction non définie" | "Activité non soignante" |
| 540586 | "10000086842" | "860012228" | "860012228" | "13001256000038" | "F860012228" | 2 | "Psychologue" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| … | … | … | … | … | … | … | … | … | … | … | … |
| 538572 | "10111123542" | "970400016" | "970400016" | "26974214400034" | "F970400016" | 2 | "Psychologue" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 538882 | "10111252994" | "250006954" | "250006954" | "26250176000264" | "F250006954" | 2 | "Psychothérapeute" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 2162932 | "10111252994" | "250006954" | "250006954" | "26250176000264" | "F250006954" | 2 | "Psychologue" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 2163446 | "10111293121" | "490540218" | "490540218" | "77568873211159" | "F490540218" | 2 | "Infirmier" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 2163447 | "10111293121" | "490540218" | "490540218" | "77568873211159" | "F490540218" | 2 | "Infirmier" | null | "Salarié" | "Cadre de santé de proximité" | "Activité non soignante" |
"
+ ],
"text/plain": [
"shape: (25_389, 12)\n",
"┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n",
@@ -779,16 +750,6 @@
"│ ┆ 1 ┆ ┆ ┆ ┆ ┆ ┆ santé de ┆ non │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ proximité ┆ soignante │\n",
"└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘"
- ],
- "text/html": [
- "\n",
- "
shape: (25_389, 12)| index | Identifiant PP | Site_Identifier_Global | Numéro FINESS site | Numéro SIRET site | Identifiant technique de la structure | Activites_Count | Libellé profession | Libellé savoir-faire | Libellé mode exercice | Libellé rôle | Libellé genre activité |
|---|
| u32 | str | str | str | str | str | u32 | str | str | str | str | str |
| 1352840 | "10000017979" | "130786445" | "130786445" | "30247736900011" | "F130786445" | 2 | "Ostéopathe" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 1893982 | "10000017979" | "130786445" | "130786445" | "30247736900011" | "F130786445" | 2 | "Sage-Femme" | null | "Salarié" | "Fonction non définie" | "Activité standard de soin ou d… |
| 270269 | "10000070283" | "39784090100011" | null | "39784090100011" | "R10000002500225" | 2 | "Chirurgien-Dentiste" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 811380 | "10000070283" | "39784090100011" | null | "39784090100011" | "R10000002500225" | 2 | "Chirurgien-Dentiste" | null | "Salarié" | "Fonction non définie" | "Activité non soignante" |
| 540586 | "10000086842" | "860012228" | "860012228" | "13001256000038" | "F860012228" | 2 | "Psychologue" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| … | … | … | … | … | … | … | … | … | … | … | … |
| 538572 | "10111123542" | "970400016" | "970400016" | "26974214400034" | "F970400016" | 2 | "Psychologue" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 538882 | "10111252994" | "250006954" | "250006954" | "26250176000264" | "F250006954" | 2 | "Psychothérapeute" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 2162932 | "10111252994" | "250006954" | "250006954" | "26250176000264" | "F250006954" | 2 | "Psychologue" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 2163446 | "10111293121" | "490540218" | "490540218" | "77568873211159" | "F490540218" | 2 | "Infirmier" | null | "Salarié" | "Salarié en poste fixe" | "Activité standard de soin ou d… |
| 2163447 | "10111293121" | "490540218" | "490540218" | "77568873211159" | "F490540218" | 2 | "Infirmier" | null | "Salarié" | "Cadre de santé de proximité" | "Activité non soignante" |
"
]
},
"execution_count": 68,
@@ -796,54 +757,75 @@
"output_type": "execute_result"
}
],
- "execution_count": 68
+ "source": [
+ "%%time\n",
+ "df6 = (\n",
+ " df\n",
+ " .with_columns(\n",
+ " pd.coalesce(\n",
+ " pd.col('Numéro FINESS site'),\n",
+ " pd.col('Numéro SIRET site'),\n",
+ " pd.col('Identifiant technique de la structure')\n",
+ " ).alias('Site_Identifier_Global')\n",
+ " )\n",
+ " .filter(pd.col('Site_Identifier_Global').is_not_null())\n",
+ " .with_columns(\n",
+ " pd.struct([\n",
+ " \"Libellé profession\",\n",
+ " \"Libellé savoir-faire\",\n",
+ " \"Libellé mode exercice\",\n",
+ " \"Libellé rôle\",\n",
+ " \"Libellé genre activité\"\n",
+ " ]).n_unique().over([\"Identifiant PP\", \"Site_Identifier_Global\"]).alias(\"Activites_Count\")\n",
+ " )\n",
+ " .filter(pd.col(\"Activites_Count\") > 1)\n",
+ " .sort([\"Identifiant PP\", \"Site_Identifier_Global\", \"index\"])\n",
+ " .select([\n",
+ " \"index\",\n",
+ " \"Identifiant PP\",\n",
+ " \"Site_Identifier_Global\",\n",
+ " \"Numéro FINESS site\",\n",
+ " \"Numéro SIRET site\",\n",
+ " \"Identifiant technique de la structure\",\n",
+ " \"Activites_Count\",\n",
+ " \"Libellé profession\",\n",
+ " \"Libellé savoir-faire\",\n",
+ " \"Libellé mode exercice\",\n",
+ " \"Libellé rôle\",\n",
+ " \"Libellé genre activité\"\n",
+ " ])\n",
+ ")\n",
+ "df6\n"
+ ]
},
{
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "6f7025a7c08b54b4",
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T22:46:17.851427Z",
"start_time": "2025-08-19T22:46:17.796168Z"
}
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"df6.write_csv(f\"{folder}{output_file}-Multiple_Activities_Per_Site{output_extension}\",\n",
" separator='|',\n",
" quote_style=\"never\",\n",
" line_terminator='\\n')\n"
- ],
- "id": "6f7025a7c08b54b4",
- "outputs": [],
- "execution_count": 69
+ ]
},
{
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "b18d9ba71ba63d9d",
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T22:46:19.535052Z",
"start_time": "2025-08-19T22:46:18.015194Z"
}
},
- "cell_type": "code",
- "source": [
- "%%time\n",
- "df7 = (\n",
- " df\n",
- " .with_columns(\n",
- " pd.col(\"Libellé profession\").n_unique().over(\"Identifiant PP\").alias(\"Profession_Count\")\n",
- " )\n",
- " .filter(pd.col(\"Profession_Count\") > 1)\n",
- " .sort([\"Identifiant PP\", \"index\"])\n",
- " .select([\n",
- " \"index\",\n",
- " \"Identifiant PP\",\n",
- " \"Profession_Count\",\n",
- " \"Libellé profession\",\n",
- " \"Libellé savoir-faire\"\n",
- " ])\n",
- ")\n",
- "df7\n"
- ],
- "id": "b18d9ba71ba63d9d",
"outputs": [
{
"name": "stdout",
@@ -855,6 +837,16 @@
},
{
"data": {
+ "text/html": [
+ "\n",
+ "
shape: (88_845, 5)| index | Identifiant PP | Profession_Count | Libellé profession | Libellé savoir-faire |
|---|
| u32 | str | u32 | str | str |
| 74 | "10000013150" | 2 | "Médecin" | "Psychiatrie" |
| 269913 | "10000013150" | 2 | "Psychothérapeute" | null |
| 1352840 | "10000017979" | 2 | "Ostéopathe" | null |
| 1893982 | "10000017979" | 2 | "Sage-Femme" | null |
| 811125 | "10000029966" | 2 | "Sage-Femme" | null |
| … | … | … | … | … |
| 1352396 | "10111320304" | 2 | "Psychothérapeute" | null |
| 269545 | "10111320379" | 2 | "Psychothérapeute" | null |
| 810601 | "10111320379" | 2 | "Psychologue" | null |
| 539711 | "10111321468" | 2 | "Psychothérapeute" | null |
| 1352414 | "10111321468" | 2 | "Psychologue" | null |
"
+ ],
"text/plain": [
"shape: (88_845, 5)\n",
"┌─────────┬────────────────┬──────────────────┬────────────────────┬──────────────────────┐\n",
@@ -874,16 +866,6 @@
"│ 539711 ┆ 10111321468 ┆ 2 ┆ Psychothérapeute ┆ null │\n",
"│ 1352414 ┆ 10111321468 ┆ 2 ┆ Psychologue ┆ null │\n",
"└─────────┴────────────────┴──────────────────┴────────────────────┴──────────────────────┘"
- ],
- "text/html": [
- "\n",
- "
shape: (88_845, 5)| index | Identifiant PP | Profession_Count | Libellé profession | Libellé savoir-faire |
|---|
| u32 | str | u32 | str | str |
| 74 | "10000013150" | 2 | "Médecin" | "Psychiatrie" |
| 269913 | "10000013150" | 2 | "Psychothérapeute" | null |
| 1352840 | "10000017979" | 2 | "Ostéopathe" | null |
| 1893982 | "10000017979" | 2 | "Sage-Femme" | null |
| 811125 | "10000029966" | 2 | "Sage-Femme" | null |
| … | … | … | … | … |
| 1352396 | "10111320304" | 2 | "Psychothérapeute" | null |
| 269545 | "10111320379" | 2 | "Psychothérapeute" | null |
| 810601 | "10111320379" | 2 | "Psychologue" | null |
| 539711 | "10111321468" | 2 | "Psychothérapeute" | null |
| 1352414 | "10111321468" | 2 | "Psychologue" | null |
"
]
},
"execution_count": 70,
@@ -891,32 +873,50 @@
"output_type": "execute_result"
}
],
- "execution_count": 70
+ "source": [
+ "%%time\n",
+ "df7 = (\n",
+ " df\n",
+ " .with_columns(\n",
+ " pd.col(\"Libellé profession\").n_unique().over(\"Identifiant PP\").alias(\"Profession_Count\")\n",
+ " )\n",
+ " .filter(pd.col(\"Profession_Count\") > 1)\n",
+ " .sort([\"Identifiant PP\", \"index\"])\n",
+ " .select([\n",
+ " \"index\",\n",
+ " \"Identifiant PP\",\n",
+ " \"Profession_Count\",\n",
+ " \"Libellé profession\",\n",
+ " \"Libellé savoir-faire\"\n",
+ " ])\n",
+ ")\n",
+ "df7\n"
+ ]
},
{
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "85be468fd3f461d1",
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T22:46:19.893214Z",
"start_time": "2025-08-19T22:46:19.851765Z"
}
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"df7.write_csv(f\"{folder}{output_file}-Multiple_Professions{output_extension}\",\n",
" separator='|',\n",
" quote_style=\"never\",\n",
" line_terminator='\\n')\n"
- ],
- "id": "85be468fd3f461d1",
- "outputs": [],
- "execution_count": 71
+ ]
}
],
"metadata": {
"kernelspec": {
- "name": "python3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
- "display_name": "Python 3 (ipykernel)"
+ "name": "python3"
}
},
"nbformat": 4,
diff --git a/Table_Réf_Professionnels_inconsistencies.xlsx b/Table_Réf_Professionnels_inconsistencies.xlsx
index 675a028..e6a0861 100644
Binary files a/Table_Réf_Professionnels_inconsistencies.xlsx and b/Table_Réf_Professionnels_inconsistencies.xlsx differ