commit 0ae48d63f2690b93529e7737c1ba77aea2fff850 Author: Abdelkouddous LHACHIMI Date: Thu Mar 5 11:11:10 2026 +0000 Initial Commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..220c29d --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.rar +*.zip +*.txt +*.csv diff --git a/.ipynb_checkpoints/Professionals_Sort-checkpoint.ipynb b/.ipynb_checkpoints/Professionals_Sort-checkpoint.ipynb new file mode 100644 index 0000000..789d7c7 --- /dev/null +++ b/.ipynb_checkpoints/Professionals_Sort-checkpoint.ipynb @@ -0,0 +1,76 @@ +{ + "cells": [ + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import csv\n", + "filename = 'Table_Réf_Professionnels_250430'\n", + "df = pd.read_csv(\"H:\\\\Mon Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\\"+filename+\".csv\", sep='|', doublequote=False, quoting=csv.QUOTE_NONE,dtype=str, na_values='', keep_default_na=False)\n", + "df.columns" + ], + "metadata": { + "collapsed": false, + "jupyter": { + "is_executing": true + } + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "source": [ + "df.shape" + ], + "metadata": { + "collapsed": false + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "source": [ + "df_sorted = df.sort_values([\"Identifiant PP\", \"Nom d'exercice\", \"Prénom d'exercice\", 'Libellé profession', 'Libellé savoir-faire', 'Bureau cedex (coord. structure)', 'Numéro Voie (coord. structure)', 'Indice répétition voie (coord. structure)', 'Libellé type de voie (coord. structure)', 'Libellé Voie (coord. structure)' , 'Mention distribution (coord. structure)', 'Téléphone (coord. structure)'])" + ], + "metadata": { + "collapsed": false + }, + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "source": [ + "df_sorted.to_csv(\"H:\\\\Mon Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\\"+filename+\"-sorted.csv\", sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE, lineterminator='\\n')" + ], + "metadata": { + "collapsed": false + }, + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Professionals.ipynb b/Professionals.ipynb new file mode 100644 index 0000000..ab07fee --- /dev/null +++ b/Professionals.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [], + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "Index(['Type d'identifiant PP', 'Identifiant PP',\n 'Identification nationale PP', 'Code civilité d'exercice',\n 'Libellé civilité d'exercice', 'Code civilité', 'Libellé civilité',\n 'Nom d'exercice', 'Prénom d'exercice', 'Code profession',\n 'Libellé profession', 'Code catégorie professionnelle',\n 'Libellé catégorie professionnelle', 'Code type savoir-faire',\n 'Libellé type savoir-faire', 'Code savoir-faire',\n 'Libellé savoir-faire', 'Code mode exercice', 'Libellé mode exercice',\n 'Numéro SIRET site', 'Numéro SIREN site', 'Numéro FINESS site',\n 'Numéro FINESS établissement juridique',\n 'Identifiant technique de la structure', 'Raison sociale site',\n 'Enseigne commerciale site',\n 'Complément destinataire (coord. structure)',\n 'Complément point géographique (coord. structure)',\n 'Numéro Voie (coord. structure)',\n 'Indice répétition voie (coord. structure)',\n 'Code type de voie (coord. structure)',\n 'Libellé type de voie (coord. structure)',\n 'Libellé Voie (coord. structure)',\n 'Mention distribution (coord. structure)',\n 'Bureau cedex (coord. structure)', 'Code postal (coord. structure)',\n 'Code commune (coord. structure)', 'Libellé commune (coord. structure)',\n 'Code pays (coord. structure)', 'Libellé pays (coord. structure)',\n 'Téléphone (coord. structure)', 'Téléphone 2 (coord. structure)',\n 'Télécopie (coord. structure)', 'Adresse e-mail (coord. structure)',\n 'Code Département (structure)', 'Libellé Département (structure)',\n 'Ancien identifiant de la structure', 'Autorité d'enregistrement',\n 'Code secteur d'activité', 'Libellé secteur d'activité',\n 'Code section tableau pharmaciens',\n 'Libellé section tableau pharmaciens', 'Unnamed: 52'],\n dtype='object')" + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import csv\n", + "df = pd.read_csv(\"C:\\_temp\\Professionnels\\Table_Réf_Professionnels_220615.csv\", sep='|', doublequote=False, quoting=csv.QUOTE_NONE,\n", + " dtype=str, na_values='', keep_default_na=False)\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": "(864328, 53)" + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "(843643, 53)" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.drop_duplicates().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": "(812168, 53)" + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.drop_duplicates([\"Nom d'exercice\", \"Prénom d'exercice\", 'Libellé profession', 'Libellé savoir-faire', 'Bureau cedex (coord. structure)']).shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "(814972, 53)" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.drop_duplicates([\"Nom d'exercice\", \"Prénom d'exercice\", 'Libellé profession', 'Libellé savoir-faire', 'Bureau cedex (coord. structure)', \"Identifiant PP\"]).shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "(839643, 53)" + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.drop_duplicates([\"Nom d'exercice\", \"Prénom d'exercice\", 'Libellé profession', 'Libellé savoir-faire', 'Bureau cedex (coord. structure)', \"Identifiant PP\", 'Numéro Voie (coord. structure)', 'Indice répétition voie (coord. structure)', 'Libellé type de voie (coord. structure)', 'Libellé Voie (coord. structure)' , 'Mention distribution (coord. structure)']).shape" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/Professionals_Activities_Inconsistencies.ipynb b/Professionals_Activities_Inconsistencies.ipynb new file mode 100644 index 0000000..39dde05 --- /dev/null +++ b/Professionals_Activities_Inconsistencies.ipynb @@ -0,0 +1,924 @@ +{ + "cells": [ + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:45:53.794236Z", + "start_time": "2025-08-19T22:45:51.445477Z" + } + }, + "cell_type": "code", + "source": [ + "%%time\n", + "import polars as pd\n", + "import csv\n", + "folder = \"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\\"\n", + "input_file = \"Table_Réf_Professionnels_250815.txt\"\n", + "output_file = \"Table_Réf_Professionnels_inconsistencies\"\n", + "output_extension = \".csv\"\n", + "df = pd.read_csv(f\"{folder}{input_file}\",\n", + " separator='|',\n", + " quote_char=None,\n", + " null_values='',\n", + " infer_schema_length=0) # Read all columns as strings\n", + "df = df.with_row_index('index')\n", + "df.columns\n" + ], + "id": "58db5082e27759f7", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 13 s\n", + "Wall time: 2.33 s\n" + ] + }, + { + "data": { + "text/plain": [ + "['index',\n", + " \"Type d'identifiant PP\",\n", + " 'Identifiant PP',\n", + " 'Identification nationale PP',\n", + " \"Code civilité d'exercice\",\n", + " \"Libellé civilité d'exercice\",\n", + " 'Code civilité',\n", + " 'Libellé civilité',\n", + " \"Nom d'exercice\",\n", + " \"Prénom d'exercice\",\n", + " 'Code profession',\n", + " 'Libellé profession',\n", + " 'Code catégorie professionnelle',\n", + " 'Libellé catégorie professionnelle',\n", + " 'Code type savoir-faire',\n", + " 'Libellé type savoir-faire',\n", + " 'Code savoir-faire',\n", + " 'Libellé savoir-faire',\n", + " 'Code mode exercice',\n", + " 'Libellé mode exercice',\n", + " 'Numéro SIRET site',\n", + " 'Numéro SIREN site',\n", + " 'Numéro FINESS site',\n", + " 'Numéro FINESS établissement juridique',\n", + " 'Identifiant technique de la structure',\n", + " 'Raison sociale site',\n", + " 'Enseigne commerciale site',\n", + " 'Complément destinataire (coord. structure)',\n", + " 'Complément point géographique (coord. structure)',\n", + " 'Numéro Voie (coord. structure)',\n", + " 'Indice répétition voie (coord. structure)',\n", + " 'Code type de voie (coord. structure)',\n", + " 'Libellé type de voie (coord. structure)',\n", + " 'Libellé Voie (coord. structure)',\n", + " 'Mention distribution (coord. structure)',\n", + " 'Bureau cedex (coord. structure)',\n", + " 'Code postal (coord. structure)',\n", + " 'Code commune (coord. structure)',\n", + " 'Libellé commune (coord. structure)',\n", + " 'Code pays (coord. structure)',\n", + " 'Libellé pays (coord. structure)',\n", + " 'Téléphone (coord. structure)',\n", + " 'Téléphone 2 (coord. structure)',\n", + " 'Télécopie (coord. structure)',\n", + " 'Adresse e-mail (coord. structure)',\n", + " 'Code Département (structure)',\n", + " 'Libellé Département (structure)',\n", + " 'Ancien identifiant de la structure',\n", + " \"Autorité d'enregistrement\",\n", + " \"Code secteur d'activité\",\n", + " \"Libellé secteur d'activité\",\n", + " 'Code section tableau pharmaciens',\n", + " 'Libellé section tableau pharmaciens',\n", + " 'Code rôle',\n", + " 'Libellé rôle',\n", + " 'Code genre activité',\n", + " 'Libellé genre activité',\n", + " '']" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 57 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:45:54.952210Z", + "start_time": "2025-08-19T22:45:53.873718Z" + } + }, + "cell_type": "code", + "source": [ + "%%time\n", + "df1 = (\n", + " df.with_columns(\n", + " (pd.col(\"Nom d'exercice\") + \" \" + pd.col(\"Prénom d'exercice\")).alias(\"Nom_Prénom\")\n", + " )\n", + " .with_columns(\n", + " pd.col(\"Nom_Prénom\").n_unique().over(\"Identifiant PP\").alias(\"Count\")\n", + " )\n", + " .filter(pd.col(\"Count\") > 1)\n", + " .sort([\"Identifiant PP\", \"index\"])\n", + " .select(\"index\", \"Identifiant PP\", \"Count\", \"Nom d'exercice\", \"Prénom d'exercice\")\n", + ")\n", + "df1\n" + ], + "id": "7d9b7562c09955", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 4.83 s\n", + "Wall time: 1.07 s\n" + ] + }, + { + "data": { + "text/plain": [ + "shape: (9_108, 5)\n", + "┌─────────┬────────────────┬───────┬────────────────┬───────────────────┐\n", + "│ index ┆ Identifiant PP ┆ Count ┆ Nom d'exercice ┆ Prénom d'exercice │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ str ┆ u32 ┆ str ┆ str │\n", + "╞═════════╪════════════════╪═══════╪════════════════╪═══════════════════╡\n", + "│ 1352933 ┆ 10000034180 ┆ 2 ┆ DUWAT-GEORGES ┆ GHISLAINE │\n", + "│ 1352934 ┆ 10000034180 ┆ 2 ┆ GEORGES ┆ GHISLAINE │\n", + "│ 261 ┆ 10000040062 ┆ 2 ┆ MEYER ┆ Nicolas │\n", + "│ 262 ┆ 10000040062 ┆ 2 ┆ MEYER ┆ Nicolas │\n", + "│ 811196 ┆ 10000040062 ┆ 2 ┆ MEYER ┆ NICOLAS │\n", + "│ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 2162425 ┆ 10111110721 ┆ 2 ┆ BARREAU ┆ Nadège │\n", + "│ 268237 ┆ 10111112636 ┆ 2 ┆ GIRAUDET ┆ MEGGIE │\n", + "│ 1892318 ┆ 10111112636 ┆ 2 ┆ GIRAUDET ┆ Meggie │\n", + "│ 269544 ┆ 10111320304 ┆ 2 ┆ Sengel ┆ Coralie │\n", + "│ 1352396 ┆ 10111320304 ┆ 2 ┆ SENGEL ┆ Coralie │\n", + "└─────────┴────────────────┴───────┴────────────────┴───────────────────┘" + ], + "text/html": [ + "
\n", + "shape: (9_108, 5)
indexIdentifiant PPCountNom d'exercicePrénom d'exercice
u32stru32strstr
1352933"10000034180"2"DUWAT-GEORGES""GHISLAINE"
1352934"10000034180"2"GEORGES""GHISLAINE"
261"10000040062"2"MEYER""Nicolas"
262"10000040062"2"MEYER""Nicolas"
811196"10000040062"2"MEYER""NICOLAS"
2162425"10111110721"2"BARREAU""Nadège"
268237"10111112636"2"GIRAUDET""MEGGIE"
1892318"10111112636"2"GIRAUDET""Meggie"
269544"10111320304"2"Sengel""Coralie"
1352396"10111320304"2"SENGEL""Coralie"
" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 58 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:45:55.090712Z", + "start_time": "2025-08-19T22:45:55.072647Z" + } + }, + "cell_type": "code", + "source": [ + "df1.write_csv(f\"{folder}{output_file}-Names_Variations_Strict{output_extension}\",\n", + " separator='|',\n", + " quote_style=\"never\",\n", + " line_terminator='\\n')\n" + ], + "id": "c418a6ea7abd77b", + "outputs": [], + "execution_count": 59 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:45:56.962873Z", + "start_time": "2025-08-19T22:45:55.259223Z" + } + }, + "cell_type": "code", + "source": [ + "%%time\n", + "df2 = (\n", + " df.with_columns(\n", + " (pd.col(\"Nom d'exercice\").str.to_lowercase() + \" \" + pd.col(\"Prénom d'exercice\").str.to_lowercase()).alias(\"Nom_Prénom\")\n", + " )\n", + " .with_columns(\n", + " pd.col(\"Nom_Prénom\").n_unique().over(\"Identifiant PP\").alias(\"Count\")\n", + " )\n", + " .filter(pd.col(\"Count\") > 1)\n", + " .sort([\"Identifiant PP\", \"index\"])\n", + " .select(\"index\", \"Identifiant PP\", \"Count\", \"Nom d'exercice\", \"Prénom d'exercice\")\n", + ")\n", + "df2\n" + ], + "id": "9d94b716364356c7", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 5.06 s\n", + "Wall time: 1.68 s\n" + ] + }, + { + "data": { + "text/plain": [ + "shape: (5_426, 5)\n", + "┌─────────┬────────────────┬───────┬────────────────┬───────────────────┐\n", + "│ index ┆ Identifiant PP ┆ Count ┆ Nom d'exercice ┆ Prénom d'exercice │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ str ┆ u32 ┆ str ┆ str │\n", + "╞═════════╪════════════════╪═══════╪════════════════╪═══════════════════╡\n", + "│ 1352933 ┆ 10000034180 ┆ 2 ┆ DUWAT-GEORGES ┆ GHISLAINE │\n", + "│ 1352934 ┆ 10000034180 ┆ 2 ┆ GEORGES ┆ GHISLAINE │\n", + "│ 1353009 ┆ 10000046051 ┆ 2 ┆ STUDER ┆ AGNES │\n", + "│ 1623173 ┆ 10000046051 ┆ 2 ┆ JURION ┆ AGNES │\n", + "│ 270462 ┆ 10000101518 ┆ 2 ┆ BARREYRE ┆ SANDRINE │\n", + "│ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 1619731 ┆ 10110987236 ┆ 2 ┆ ROGIER ┆ MATHILDE │\n", + "│ 808810 ┆ 10111077417 ┆ 2 ┆ DOUVIER ┆ FRANCETTE │\n", + "│ 2161999 ┆ 10111077417 ┆ 2 ┆ D'ELLOY ┆ FRANCETTE │\n", + "│ 538415 ┆ 10111110721 ┆ 2 ┆ ROCHEPEAU ┆ Nadège │\n", + "│ 2162425 ┆ 10111110721 ┆ 2 ┆ BARREAU ┆ Nadège │\n", + "└─────────┴────────────────┴───────┴────────────────┴───────────────────┘" + ], + "text/html": [ + "
\n", + "shape: (5_426, 5)
indexIdentifiant PPCountNom d'exercicePrénom d'exercice
u32stru32strstr
1352933"10000034180"2"DUWAT-GEORGES""GHISLAINE"
1352934"10000034180"2"GEORGES""GHISLAINE"
1353009"10000046051"2"STUDER""AGNES"
1623173"10000046051"2"JURION""AGNES"
270462"10000101518"2"BARREYRE""SANDRINE"
1619731"10110987236"2"ROGIER""MATHILDE"
808810"10111077417"2"DOUVIER""FRANCETTE"
2161999"10111077417"2"D'ELLOY""FRANCETTE"
538415"10111110721"2"ROCHEPEAU""Nadège"
2162425"10111110721"2"BARREAU""Nadège"
" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 60 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:45:57.433036Z", + "start_time": "2025-08-19T22:45:57.417970Z" + } + }, + "cell_type": "code", + "source": [ + "df2.write_csv(f\"{folder}{output_file}-Names_Variations_Insensitive{output_extension}\",\n", + " separator='|',\n", + " quote_style=\"never\",\n", + " line_terminator='\\n')\n" + ], + "id": "18aab4499103491a", + "outputs": [], + "execution_count": 61 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:46:02.915526Z", + "start_time": "2025-08-19T22:45:57.710258Z" + } + }, + "cell_type": "code", + "source": [ + "%%time\n", + "df3 = (\n", + " df\n", + " .with_columns(\n", + " (\n", + " (pd.col(\"Nom d'exercice\") + \" \" + pd.col(\"Prénom d'exercice\"))\n", + " .str.to_lowercase()\n", + " # Normalisation des accents\n", + " .str.replace_all(\"à|á|â|ã|ä|å\", \"a\", literal=False)\n", + " .str.replace_all(\"ç\", \"c\", literal=False)\n", + " .str.replace_all(\"è|é|ê|ë\", \"e\", literal=False)\n", + " .str.replace_all(\"ì|í|î|ï\", \"i\", literal=False)\n", + " .str.replace_all(\"ñ\", \"n\", literal=False)\n", + " .str.replace_all(\"ò|ó|ô|õ|ö\", \"o\", literal=False)\n", + " .str.replace_all(\"ù|ú|û|ü\", \"u\", literal=False)\n", + " .str.replace_all(\"ý|ÿ\", \"y\", literal=False)\n", + " # Remplacement des caractères non-alphanumériques et nettoyage des espaces\n", + " .str.replace_all(r\"[^a-z0-9\\\\s]\", \" \", literal=False)\n", + " .str.replace_all(r\"\\\\s+\", \" \", literal=False)\n", + " .str.strip_chars()\n", + " ).alias(\"Nom_Prénom_Nettoyé\")\n", + " )\n", + " .with_columns(\n", + " pd.col(\"Nom_Prénom_Nettoyé\").n_unique().over(\"Identifiant PP\").alias(\"Count\")\n", + " )\n", + " .filter(pd.col(\"Count\") > 1)\n", + " .sort([\"Identifiant PP\", \"index\"])\n", + " .select(\"index\", \"Identifiant PP\", \"Count\", \"Nom d'exercice\", \"Prénom d'exercice\", \"Nom_Prénom_Nettoyé\")\n", + ")\n", + "df3\n" + ], + "id": "8e4e3e22f16fea1c", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 8.59 s\n", + "Wall time: 5.19 s\n" + ] + }, + { + "data": { + "text/plain": [ + "shape: (3_584, 6)\n", + "┌─────────┬────────────────┬───────┬────────────────┬───────────────────┬─────────────────────────┐\n", + "│ index ┆ Identifiant PP ┆ Count ┆ Nom d'exercice ┆ Prénom d'exercice ┆ Nom_Prénom_Nettoyé │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n", + "╞═════════╪════════════════╪═══════╪════════════════╪═══════════════════╪═════════════════════════╡\n", + "│ 1352933 ┆ 10000034180 ┆ 2 ┆ DUWAT-GEORGES ┆ GHISLAINE ┆ duwat georges ghislaine │\n", + "│ 1352934 ┆ 10000034180 ┆ 2 ┆ GEORGES ┆ GHISLAINE ┆ georges ghislaine │\n", + "│ 1353009 ┆ 10000046051 ┆ 2 ┆ STUDER ┆ AGNES ┆ studer agnes │\n", + "│ 1623173 ┆ 10000046051 ┆ 2 ┆ JURION ┆ AGNES ┆ jurion agnes │\n", + "│ 270462 ┆ 10000101518 ┆ 2 ┆ BARREYRE ┆ SANDRINE ┆ barreyre sandrine │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 1619731 ┆ 10110987236 ┆ 2 ┆ ROGIER ┆ MATHILDE ┆ rogier mathilde │\n", + "│ 808810 ┆ 10111077417 ┆ 2 ┆ DOUVIER ┆ FRANCETTE ┆ douvier francette │\n", + "│ 2161999 ┆ 10111077417 ┆ 2 ┆ D'ELLOY ┆ FRANCETTE ┆ d elloy francette │\n", + "│ 538415 ┆ 10111110721 ┆ 2 ┆ ROCHEPEAU ┆ Nadège ┆ rochepeau nadege │\n", + "│ 2162425 ┆ 10111110721 ┆ 2 ┆ BARREAU ┆ Nadège ┆ barreau nadege │\n", + "└─────────┴────────────────┴───────┴────────────────┴───────────────────┴─────────────────────────┘" + ], + "text/html": [ + "
\n", + "shape: (3_584, 6)
indexIdentifiant PPCountNom d'exercicePrénom d'exerciceNom_Prénom_Nettoyé
u32stru32strstrstr
1352933"10000034180"2"DUWAT-GEORGES""GHISLAINE""duwat georges ghislaine"
1352934"10000034180"2"GEORGES""GHISLAINE""georges ghislaine"
1353009"10000046051"2"STUDER""AGNES""studer agnes"
1623173"10000046051"2"JURION""AGNES""jurion agnes"
270462"10000101518"2"BARREYRE""SANDRINE""barreyre sandrine"
1619731"10110987236"2"ROGIER""MATHILDE""rogier mathilde"
808810"10111077417"2"DOUVIER""FRANCETTE""douvier francette"
2161999"10111077417"2"D'ELLOY""FRANCETTE""d elloy francette"
538415"10111110721"2"ROCHEPEAU""Nadège""rochepeau nadege"
2162425"10111110721"2"BARREAU""Nadège""barreau nadege"
" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 62 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:46:03.290835Z", + "start_time": "2025-08-19T22:46:03.280259Z" + } + }, + "cell_type": "code", + "source": [ + "df3.write_csv(f\"{folder}{output_file}-Names_Variations_Normalized{output_extension}\",\n", + " separator='|',\n", + " quote_style=\"never\",\n", + " line_terminator='\\n')\n" + ], + "id": "aab2ae2e91a7190c", + "outputs": [], + "execution_count": 63 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:46:07.814563Z", + "start_time": "2025-08-19T22:46:03.493442Z" + } + }, + "cell_type": "code", + "source": [ + "%%time\n", + "all_columns = df.columns\n", + "start_col = 'Raison sociale site'\n", + "end_col = \"Libellé secteur d'activité\"\n", + "start_col_index = all_columns.index(start_col)\n", + "end_col_index = all_columns.index(end_col)\n", + "site_info_cols = all_columns[start_col_index : end_col_index + 1]\n", + "if \"Autorité d'enregistrement\" in site_info_cols:\n", + " site_info_cols.remove(\"Autorité d'enregistrement\")\n", + "\n", + "df4 = (\n", + " df\n", + " .filter(pd.col('Numéro FINESS site').is_null())\n", + " .filter(\n", + " pd.col('Numéro SIRET site').is_not_null() | pd.col('Identifiant technique de la structure').is_not_null()\n", + " )\n", + " .with_columns(\n", + " pd.coalesce(\n", + " pd.col('Numéro SIRET site'),\n", + " pd.col('Identifiant technique de la structure')\n", + " ).alias('Site_Identifier')\n", + " )\n", + " .with_columns(\n", + " pd.struct(site_info_cols).n_unique().over(['Identifiant PP', 'Site_Identifier']).alias('Site_Info_Variations_Count')\n", + " )\n", + " .filter(pd.col('Site_Info_Variations_Count') > 1)\n", + " .sort(['Identifiant PP', 'Site_Identifier', 'index'])\n", + " .select(['index', \n", + " 'Identifiant PP', \n", + " 'Site_Identifier', \n", + " 'Numéro SIRET site', \n", + " 'Identifiant technique de la structure', \n", + " 'Site_Info_Variations_Count'] + site_info_cols)\n", + ")\n", + "df4" + ], + "id": "3c2f2bb5fc3c2a5e", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 18.1 s\n", + "Wall time: 4.29 s\n" + ] + }, + { + "data": { + "text/plain": [ + "shape: (98, 31)\n", + "┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", + "│ index ┆ Identifian ┆ Site_Iden ┆ Numéro ┆ … ┆ Libellé ┆ Ancien ┆ Code ┆ Libellé │\n", + "│ --- ┆ t PP ┆ tifier ┆ SIRET ┆ ┆ Départeme ┆ identifia ┆ secteur ┆ secteur │\n", + "│ u32 ┆ --- ┆ --- ┆ site ┆ ┆ nt (struc ┆ nt de la ┆ d'activit ┆ d'activit │\n", + "│ ┆ str ┆ str ┆ --- ┆ ┆ ture… ┆ struc… ┆ é ┆ é │\n", + "│ ┆ ┆ ┆ str ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ str ┆ str │\n", + "╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ 270597 ┆ 1000011638 ┆ 309802205 ┆ 309802205 ┆ … ┆ null ┆ 330980220 ┆ SA28 ┆ Asso et │\n", + "│ ┆ 3 ┆ 00505 ┆ 00505 ┆ ┆ ┆ 500505 ┆ ┆ orga huma │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ nitaire │\n", + "│ 1353470 ┆ 1000011638 ┆ 309802205 ┆ 309802205 ┆ … ┆ null ┆ 330980220 ┆ SA28 ┆ Asso et │\n", + "│ ┆ 3 ┆ 00505 ┆ 00505 ┆ ┆ ┆ 500505 ┆ ┆ orga huma │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ nitaire │\n", + "│ 4214 ┆ 1000053630 ┆ 180035024 ┆ 180035024 ┆ … ┆ null ┆ 318003502 ┆ SA24 ┆ Organisme │\n", + "│ ┆ 9 ┆ 02369 ┆ 02369 ┆ ┆ ┆ 402369 ┆ ┆ de │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", + "│ 1627221 ┆ 1000053630 ┆ 180035024 ┆ 180035024 ┆ … ┆ null ┆ 318003502 ┆ SA24 ┆ Organisme │\n", + "│ ┆ 9 ┆ 02369 ┆ 02369 ┆ ┆ ┆ 402369 ┆ ┆ de │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", + "│ 816501 ┆ 1000066718 ┆ 180035024 ┆ 180035024 ┆ … ┆ null ┆ 318003502 ┆ SA24 ┆ Organisme │\n", + "│ ┆ 7 ┆ 02369 ┆ 02369 ┆ ┆ ┆ 402369 ┆ ┆ de │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 532248 ┆ 1011059216 ┆ 880859350 ┆ 880859350 ┆ … ┆ null ┆ 388085935 ┆ SA32 ┆ Fab. │\n", + "│ ┆ 8 ┆ 00014 ┆ 00014 ┆ ┆ ┆ 000014 ┆ ┆ Exploit. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. DM │\n", + "│ 1344743 ┆ 1011059749 ┆ 130008006 ┆ 130008006 ┆ … ┆ null ┆ 313000800 ┆ SA24 ┆ Organisme │\n", + "│ ┆ 8 ┆ 00038 ┆ 00038 ┆ ┆ ┆ 600038 ┆ ┆ de │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", + "│ 2156205 ┆ 1011059749 ┆ 130008006 ┆ 130008006 ┆ … ┆ null ┆ 313000800 ┆ SA24 ┆ Organisme │\n", + "│ ┆ 8 ┆ 00038 ┆ 00038 ┆ ┆ ┆ 600038 ┆ ┆ de │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", + "│ 1618789 ┆ 1011091034 ┆ 180035024 ┆ 180035024 ┆ … ┆ null ┆ 318003502 ┆ SA24 ┆ Organisme │\n", + "│ ┆ 5 ┆ 01098 ┆ 01098 ┆ ┆ ┆ 401098 ┆ ┆ de │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", + "│ 1889827 ┆ 1011091034 ┆ 180035024 ┆ 180035024 ┆ … ┆ null ┆ 318003502 ┆ SA24 ┆ Organisme │\n", + "│ ┆ 5 ┆ 01098 ┆ 01098 ┆ ┆ ┆ 401098 ┆ ┆ de │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", + "└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" + ], + "text/html": [ + "
\n", + "shape: (98, 31)
indexIdentifiant PPSite_IdentifierNuméro SIRET siteIdentifiant technique de la structureSite_Info_Variations_CountRaison sociale siteEnseigne commerciale siteComplément destinataire (coord. structure)Complément point géographique (coord. structure)Numéro Voie (coord. structure)Indice répétition voie (coord. structure)Code type de voie (coord. structure)Libellé type de voie (coord. structure)Libellé Voie (coord. structure)Mention distribution (coord. structure)Bureau cedex (coord. structure)Code postal (coord. structure)Code commune (coord. structure)Libellé commune (coord. structure)Code pays (coord. structure)Libellé pays (coord. structure)Téléphone (coord. structure)Téléphone 2 (coord. structure)Télécopie (coord. structure)Adresse e-mail (coord. structure)Code Département (structure)Libellé Département (structure)Ancien identifiant de la structureCode secteur d'activitéLibellé secteur d'activité
u32strstrstrstru32strstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstr
270597"10000116383""30980220500505""30980220500505""R10100000198782"2"OEUVRES HOSPITALIERES FRANCAIS…"ORDRE DE MALTE FRANCE""ORDRE DE MALTE FRANCE"null"42"nullnullnull"RUE DES VOLONTAIRES"null"75015 PARIS""75015""75056""Paris""99000""France"nullnullnullnullnullnull"330980220500505""SA28""Asso et orga humanitaire"
1353470"10000116383""30980220500505""30980220500505""R10100000779807"2"OEUVRE HOSP FRANC DE L'ORDRE D…nullnullnull"49"null"R""Rue""DE LA CHAPELLE"null"75018 PARIS 18E  ARRONDISSEMEN…"75018""75118""Paris 18e  Arrondissement"nullnullnullnullnullnullnullnull"330980220500505""SA28""Asso et orga humanitaire"
4214"10000536309""18003502402369""18003502402369""R10100000050224"2"DRSM PAYS DE LA LOIREELSM 44""SITE NANTES"nullnull"9"null"R""Rue""GAETAN RONDEAU""BP""44203 NANTES""44203""44109""Nantes"nullnullnullnullnullnullnullnull"318003502402369""SA24""Organisme de Sécurité Sociale"
1627221"10000536309""18003502402369""18003502402369""R10100000049799"2"DRSM PAYS DE LOIREELSM 49""SITE CHOLET"nullnull"2"null"R""Rue""SAINT ELOI""BP""49321 CHOLET""49321""49099""Cholet"nullnullnullnullnullnullnullnull"318003502402369""SA24""Organisme de Sécurité Sociale"
816501"10000667187""18003502402369""18003502402369""R10100000049794"3"DRSM NORD PICARDIEELSM 59""SITE MAUBEUGE"nullnullnullnull"PL""Place""DE WATTIGNIES""BP""59603 MAUBEUGE""59603""59392""Maubeuge"nullnullnullnullnullnullnullnull"318003502402369""SA24""Organisme de Sécurité Sociale"
532248"10110592168""88085935000014""88085935000014""R10100000325887"2"FAREVA PAU"nullnullnullnullnull"AV""Avenue""DU BEARN"null"64320 IDRON""64320""64269""Idron"nullnull"0559402100"null"0559402119"nullnullnull"388085935000014""SA32""Fab. Exploit. Import. Méd. DM"
1344743"10110597498""13000800600038""13000800600038""R10100000097229"2"AGENCE REGIONALE SANTE PAYS LO…nullnullnull"17"null"BD""Boulevard""GASTON DOUMERGUE"null"44262 NANTES""44262""44109""Nantes"nullnullnullnullnullnullnullnull"313000800600038""SA24""Organisme de Sécurité Sociale"
2156205"10110597498""13000800600038""13000800600038""R10100000097229"2"AGENCE REGIONALE SANTE PAYS LO…null"DELEGATION TERRITORIALE"null"2"null"BD""Boulevard""MURAT"null"53000 LAVAL""53000""53130""Laval"nullnullnullnullnullnullnullnull"313000800600038""SA24""Organisme de Sécurité Sociale"
1618789"10110910345""18003502401098""18003502401098""R10100000398898"2"CAISSE NATIONALE DE L'ASSURANC…"DRSM DIRECTION REG. DU SERVICE…"QUARTIER DU LAC"null"80"nullnullnull"AVENUE DE LA JALLERE""BP 260""33300 BORDEAUX""33300""33063""Bordeaux""99000""France"nullnullnullnullnullnull"318003502401098""SA24""Organisme de Sécurité Sociale"
1889827"10110910345""18003502401098""18003502401098""R10100000398898"2"CAISSE NATIONALE DE L'ASSURANC…"DRSM DIRECTION REG. DU SERVICE…nullnull"207"null"R""Rue""FONTAINEBLEAU""BP""40011 MONT-DE-MARSAN""40011""40192""Mont-de-Marsan"nullnullnullnullnullnullnullnull"318003502401098""SA24""Organisme de Sécurité Sociale"
" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 64 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:46:07.974271Z", + "start_time": "2025-08-19T22:46:07.943280Z" + } + }, + "cell_type": "code", + "source": [ + "df4.write_csv(f\"{folder}{output_file}-Sites_Variations{output_extension}\",\n", + " separator='|',\n", + " quote_style=\"never\",\n", + " line_terminator='\\n')\n" + ], + "id": "c1fd01e419f4ccc9", + "outputs": [], + "execution_count": 65 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:46:12.781888Z", + "start_time": "2025-08-19T22:46:08.306776Z" + } + }, + "cell_type": "code", + "source": [ + "%%time\n", + "df5 = (\n", + " df\n", + " .filter(pd.col('Numéro FINESS site').is_null())\n", + " .filter(\n", + " pd.col('Numéro SIRET site').is_not_null() | pd.col('Identifiant technique de la structure').is_not_null()\n", + " )\n", + " .with_columns(\n", + " pd.coalesce(\n", + " pd.col('Numéro SIRET site'),\n", + " pd.col('Identifiant technique de la structure')\n", + " ).alias('Site_Identifier')\n", + " )\n", + " .with_columns(\n", + " pd.struct(site_info_cols).n_unique().over(['Site_Identifier']).alias('Site_Info_Variations_Count')\n", + " )\n", + " .filter(pd.col('Site_Info_Variations_Count') > 1)\n", + " .sort(['Site_Identifier', 'index'])\n", + " .select(['index', \n", + " 'Identifiant PP', \n", + " 'Site_Identifier', \n", + " 'Numéro SIRET site', \n", + " 'Identifiant technique de la structure', \n", + " 'Site_Info_Variations_Count'] + site_info_cols)\n", + ")\n", + "df5\n" + ], + "id": "7838523925fc85ee", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 19.8 s\n", + "Wall time: 4.43 s\n" + ] + }, + { + "data": { + "text/plain": [ + "shape: (4_190, 31)\n", + "┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", + "│ index ┆ Identifian ┆ Site_Iden ┆ Numéro ┆ … ┆ Libellé ┆ Ancien ┆ Code ┆ Libellé │\n", + "│ --- ┆ t PP ┆ tifier ┆ SIRET ┆ ┆ Départeme ┆ identifia ┆ secteur ┆ secteur │\n", + "│ u32 ┆ --- ┆ --- ┆ site ┆ ┆ nt (struc ┆ nt de la ┆ d'activit ┆ d'activit │\n", + "│ ┆ str ┆ str ┆ --- ┆ ┆ ture… ┆ struc… ┆ é ┆ é │\n", + "│ ┆ ┆ ┆ str ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ str ┆ str │\n", + "╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ 127508 ┆ 1010000229 ┆ 056501711 ┆ 056501711 ┆ … ┆ null ┆ 305650171 ┆ SA32 ┆ Fab. │\n", + "│ ┆ 3 ┆ 00115 ┆ 00115 ┆ ┆ ┆ 100115 ┆ ┆ Exploit. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. DM │\n", + "│ 285182 ┆ 1000180676 ┆ 056501711 ┆ 056501711 ┆ … ┆ null ┆ 305650171 ┆ SA32 ┆ Fab. │\n", + "│ ┆ 8 ┆ 00115 ┆ 00115 ┆ ┆ ┆ 100115 ┆ ┆ Exploit. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. DM │\n", + "│ 466100 ┆ 1010480041 ┆ 056501711 ┆ 056501711 ┆ … ┆ null ┆ 305650171 ┆ SA32 ┆ Fab. │\n", + "│ ┆ 1 ┆ 00115 ┆ 00115 ┆ ┆ ┆ 100115 ┆ ┆ Exploit. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. DM │\n", + "│ 722626 ┆ 1010368715 ┆ 056501711 ┆ 056501711 ┆ … ┆ null ┆ 305650171 ┆ SA32 ┆ Fab. │\n", + "│ ┆ 7 ┆ 00115 ┆ 00115 ┆ ┆ ┆ 100115 ┆ ┆ Exploit. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. DM │\n", + "│ 826390 ┆ 1000179659 ┆ 056501711 ┆ 056501711 ┆ … ┆ null ┆ 305650171 ┆ SA32 ┆ Fab. │\n", + "│ ┆ 7 ┆ 00115 ┆ 00115 ┆ ┆ ┆ 100115 ┆ ┆ Exploit. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. DM │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 793647 ┆ 1010986940 ┆ 984526194 ┆ 984526194 ┆ … ┆ null ┆ 398452619 ┆ SA09 ┆ Exercice │\n", + "│ ┆ 3 ┆ 00019 ┆ 00019 ┆ ┆ ┆ 400019 ┆ ┆ en │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Société │\n", + "│ 1500531 ┆ 1010129326 ┆ 984526194 ┆ 984526194 ┆ … ┆ null ┆ 398452619 ┆ SA09 ┆ Exercice │\n", + "│ ┆ 3 ┆ 00019 ┆ 00019 ┆ ┆ ┆ 400019 ┆ ┆ en │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Société │\n", + "│ 1852947 ┆ 1010801513 ┆ 984526194 ┆ 984526194 ┆ … ┆ null ┆ 398452619 ┆ SA09 ┆ Exercice │\n", + "│ ┆ 1 ┆ 00019 ┆ 00019 ┆ ┆ ┆ 400019 ┆ ┆ en │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Société │\n", + "│ 1571945 ┆ 1010723524 ┆ 998823504 ┆ 998823504 ┆ … ┆ null ┆ 399882350 ┆ SA11 ┆ Entrepris │\n", + "│ ┆ 3 ┆ 30834 ┆ 30834 ┆ ┆ ┆ 430834 ┆ ┆ e │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ d'intérim │\n", + "│ 1590681 ┆ 1010869303 ┆ 998823504 ┆ 998823504 ┆ … ┆ null ┆ 399882350 ┆ SA11 ┆ Entrepris │\n", + "│ ┆ 6 ┆ 30834 ┆ 30834 ┆ ┆ ┆ 430834 ┆ ┆ e │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ d'intérim │\n", + "└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" + ], + "text/html": [ + "
\n", + "shape: (4_190, 31)
indexIdentifiant PPSite_IdentifierNuméro SIRET siteIdentifiant technique de la structureSite_Info_Variations_CountRaison sociale siteEnseigne commerciale siteComplément destinataire (coord. structure)Complément point géographique (coord. structure)Numéro Voie (coord. structure)Indice répétition voie (coord. structure)Code type de voie (coord. structure)Libellé type de voie (coord. structure)Libellé Voie (coord. structure)Mention distribution (coord. structure)Bureau cedex (coord. structure)Code postal (coord. structure)Code commune (coord. structure)Libellé commune (coord. structure)Code pays (coord. structure)Libellé pays (coord. structure)Téléphone (coord. structure)Téléphone 2 (coord. structure)Télécopie (coord. structure)Adresse e-mail (coord. structure)Code Département (structure)Libellé Département (structure)Ancien identifiant de la structureCode secteur d'activitéLibellé secteur d'activité
u32strstrstrstru32strstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstr
127508"10100002293""05650171100115""05650171100115""R10000001502146"2"BECTON DICKINSON FRANCE"nullnullnull"11"null"R""Rue""RUE ARISTIDE BERGES"null"38801 LE PONT DE CLAIX CEDEX""38801""38317""Le Pont-de-Claix""99000""France"nullnullnullnullnullnull"305650171100115""SA32""Fab. Exploit. Import. Méd. DM"
285182"10001806768""05650171100115""05650171100115""R10000001502146"2"BECTON DICKINSON FRANCE"nullnullnull"11"null"R""Rue""RUE ARISTIDE BERGES"null"38801 LE PONT DE CLAIX CEDEX""38801""38317""Le Pont-de-Claix""99000""France"nullnullnullnullnullnull"305650171100115""SA32""Fab. Exploit. Import. Méd. DM"
466100"10104800411""05650171100115""05650171100115""R10000001502146"2"BECTON DICKINSON FRANCE"nullnullnull"11"null"R""Rue""RUE ARISTIDE BERGES"null"38801 LE PONT DE CLAIX CEDEX""38801""38317""Le Pont-de-Claix""99000""France"nullnullnullnullnullnull"305650171100115""SA32""Fab. Exploit. Import. Méd. DM"
722626"10103687157""05650171100115""05650171100115""R10000001502146"2"BECTON DICKINSON FRANCE"nullnullnull"11"null"R""Rue""RUE ARISTIDE BERGES"null"38801 LE PONT DE CLAIX CEDEX""38801""38317""Le Pont-de-Claix""99000""France"nullnullnullnullnullnull"305650171100115""SA32""Fab. Exploit. Import. Méd. DM"
826390"10001796597""05650171100115""05650171100115""R10000001502146"2"BECTON DICKINSON FRANCE"nullnullnull"11"null"R""Rue""RUE ARISTIDE BERGES"null"38801 LE PONT DE CLAIX CEDEX""38801""38317""Le Pont-de-Claix""99000""France"nullnullnullnullnullnull"305650171100115""SA32""Fab. Exploit. Import. Méd. DM"
793647"10109869403""98452619400019""98452619400019""R10100000673943"2"SELARL CENTRE DE PODOLOGIE SPO…"SELARL CENTRE DE PODOLOGIE SPO…nullnull"8"null"R""Rue""GEORGES NEGREVERGNE"null"33700 MERIGNAC""33700""33281""Mérignac""99000""France"nullnullnullnullnullnull"398452619400019""SA09""Exercice en Société"
1500531"10101293263""98452619400019""98452619400019""R10100000673943"2"SELARL CENTRE DE PODOLOGIE SPO…"SELARL CENTRE DE PODOLOGIE SPO…nullnull"8"null"R""Rue""GEORGES NEGREVERGNE"null"33700 MERIGNAC""33700""33281""Mérignac""99000""France"nullnullnullnullnullnull"398452619400019""SA09""Exercice en Société"
1852947"10108015131""98452619400019""98452619400019""R10100000673943"2"SELARL CENTRE DE PODOLOGIE SPO…"SELARL CENTRE DE PODOLOGIE SPO…nullnull"8"null"R""Rue""GEORGES NEGREVERGNE"null"33700 MERIGNAC""33700""33281""Mérignac""99000""France"nullnullnullnullnullnull"398452619400019""SA09""Exercice en Société"
1571945"10107235243""99882350430834""99882350430834""R10100000554688"2"ADECCO FRANCE""ADECCO"null"PARC VALMY PARK AVENUE BAT A 1…"8""D"nullnull"RUE JEANNE BARRET"null"21000 DIJON""21000""21231""Dijon""99000""France"nullnullnullnullnullnull"399882350430834""SA11""Entreprise d'intérim"
1590681"10108693036""99882350430834""99882350430834""R10100000413248"2"ADECCO MEDICAL"nullnullnull"8""D""R""Rue""JEANNE BARRET"null"21000 DIJON""21000""21231""Dijon""99000""France"nullnullnullnullnullnull"399882350430834""SA11""Entreprise d'intérim"
" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 66 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:46:13.670911Z", + "start_time": "2025-08-19T22:46:13.655386Z" + } + }, + "cell_type": "code", + "source": [ + "df5.write_csv(f\"{folder}{output_file}-Sites_Variations_Global{output_extension}\",\n", + " separator='|',\n", + " quote_style=\"never\",\n", + " line_terminator='\\n')\n" + ], + "id": "416184f32f973a71", + "outputs": [], + "execution_count": 67 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:46:17.023811Z", + "start_time": "2025-08-19T22:46:14.032470Z" + } + }, + "cell_type": "code", + "source": [ + "%%time\n", + "df6 = (\n", + " df\n", + " .with_columns(\n", + " pd.coalesce(\n", + " pd.col('Numéro FINESS site'),\n", + " pd.col('Numéro SIRET site'),\n", + " pd.col('Identifiant technique de la structure')\n", + " ).alias('Site_Identifier_Global')\n", + " )\n", + " .filter(pd.col('Site_Identifier_Global').is_not_null())\n", + " .with_columns(\n", + " pd.struct([\n", + " \"Libellé profession\",\n", + " \"Libellé savoir-faire\",\n", + " \"Libellé mode exercice\",\n", + " \"Libellé rôle\",\n", + " \"Libellé genre activité\"\n", + " ]).n_unique().over([\"Identifiant PP\", \"Site_Identifier_Global\"]).alias(\"Activites_Count\")\n", + " )\n", + " .filter(pd.col(\"Activites_Count\") > 1)\n", + " .sort([\"Identifiant PP\", \"Site_Identifier_Global\", \"index\"])\n", + " .select([\n", + " \"index\",\n", + " \"Identifiant PP\",\n", + " \"Site_Identifier_Global\",\n", + " \"Numéro FINESS site\",\n", + " \"Numéro SIRET site\",\n", + " \"Identifiant technique de la structure\",\n", + " \"Activites_Count\",\n", + " \"Libellé profession\",\n", + " \"Libellé savoir-faire\",\n", + " \"Libellé mode exercice\",\n", + " \"Libellé rôle\",\n", + " \"Libellé genre activité\"\n", + " ])\n", + ")\n", + "df6\n" + ], + "id": "84549f83ce5e92f", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 11 s\n", + "Wall time: 2.96 s\n" + ] + }, + { + "data": { + "text/plain": [ + "shape: (25_389, 12)\n", + "┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", + "│ index ┆ Identifian ┆ Site_Iden ┆ Numéro ┆ … ┆ Libellé ┆ Libellé ┆ Libellé ┆ Libellé │\n", + "│ --- ┆ t PP ┆ tifier_Gl ┆ FINESS ┆ ┆ savoir-fa ┆ mode ┆ rôle ┆ genre │\n", + "│ u32 ┆ --- ┆ obal ┆ site ┆ ┆ ire ┆ exercice ┆ --- ┆ activité │\n", + "│ ┆ str ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ str ┆ --- │\n", + "│ ┆ ┆ str ┆ str ┆ ┆ str ┆ str ┆ ┆ str │\n", + "╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ 1352840 ┆ 1000001797 ┆ 130786445 ┆ 130786445 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", + "│ ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", + "│ 1893982 ┆ 1000001797 ┆ 130786445 ┆ 130786445 ┆ … ┆ null ┆ Salarié ┆ Fonction ┆ Activité │\n", + "│ ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ non ┆ standard │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ définie ┆ de soin │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", + "│ 270269 ┆ 1000007028 ┆ 397840901 ┆ null ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", + "│ ┆ 3 ┆ 00011 ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", + "│ 811380 ┆ 1000007028 ┆ 397840901 ┆ null ┆ … ┆ null ┆ Salarié ┆ Fonction ┆ Activité │\n", + "│ ┆ 3 ┆ 00011 ┆ ┆ ┆ ┆ ┆ non ┆ non │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ définie ┆ soignante │\n", + "│ 540586 ┆ 1000008684 ┆ 860012228 ┆ 860012228 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", + "│ ┆ 2 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 538572 ┆ 1011112354 ┆ 970400016 ┆ 970400016 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", + "│ ┆ 2 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", + "│ 538882 ┆ 1011125299 ┆ 250006954 ┆ 250006954 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", + "│ ┆ 4 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", + "│ 2162932 ┆ 1011125299 ┆ 250006954 ┆ 250006954 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", + "│ ┆ 4 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", + "│ 2163446 ┆ 1011129312 ┆ 490540218 ┆ 490540218 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", + "│ ┆ 1 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", + "│ 2163447 ┆ 1011129312 ┆ 490540218 ┆ 490540218 ┆ … ┆ null ┆ Salarié ┆ Cadre de ┆ Activité │\n", + "│ ┆ 1 ┆ ┆ ┆ ┆ ┆ ┆ santé de ┆ non │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ proximité ┆ soignante │\n", + "└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" + ], + "text/html": [ + "
\n", + "shape: (25_389, 12)
indexIdentifiant PPSite_Identifier_GlobalNuméro FINESS siteNuméro SIRET siteIdentifiant technique de la structureActivites_CountLibellé professionLibellé savoir-faireLibellé mode exerciceLibellé rôleLibellé genre activité
u32strstrstrstrstru32strstrstrstrstr
1352840"10000017979""130786445""130786445""30247736900011""F130786445"2"Ostéopathe"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
1893982"10000017979""130786445""130786445""30247736900011""F130786445"2"Sage-Femme"null"Salarié""Fonction non définie""Activité standard de soin ou d…
270269"10000070283""39784090100011"null"39784090100011""R10000002500225"2"Chirurgien-Dentiste"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
811380"10000070283""39784090100011"null"39784090100011""R10000002500225"2"Chirurgien-Dentiste"null"Salarié""Fonction non définie""Activité non soignante"
540586"10000086842""860012228""860012228""13001256000038""F860012228"2"Psychologue"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
538572"10111123542""970400016""970400016""26974214400034""F970400016"2"Psychologue"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
538882"10111252994""250006954""250006954""26250176000264""F250006954"2"Psychothérapeute"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
2162932"10111252994""250006954""250006954""26250176000264""F250006954"2"Psychologue"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
2163446"10111293121""490540218""490540218""77568873211159""F490540218"2"Infirmier"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
2163447"10111293121""490540218""490540218""77568873211159""F490540218"2"Infirmier"null"Salarié""Cadre de santé de proximité""Activité non soignante"
" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 68 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:46:17.851427Z", + "start_time": "2025-08-19T22:46:17.796168Z" + } + }, + "cell_type": "code", + "source": [ + "df6.write_csv(f\"{folder}{output_file}-Multiple_Activities_Per_Site{output_extension}\",\n", + " separator='|',\n", + " quote_style=\"never\",\n", + " line_terminator='\\n')\n" + ], + "id": "6f7025a7c08b54b4", + "outputs": [], + "execution_count": 69 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:46:19.535052Z", + "start_time": "2025-08-19T22:46:18.015194Z" + } + }, + "cell_type": "code", + "source": [ + "%%time\n", + "df7 = (\n", + " df\n", + " .with_columns(\n", + " pd.col(\"Libellé profession\").n_unique().over(\"Identifiant PP\").alias(\"Profession_Count\")\n", + " )\n", + " .filter(pd.col(\"Profession_Count\") > 1)\n", + " .sort([\"Identifiant PP\", \"index\"])\n", + " .select([\n", + " \"index\",\n", + " \"Identifiant PP\",\n", + " \"Profession_Count\",\n", + " \"Libellé profession\",\n", + " \"Libellé savoir-faire\"\n", + " ])\n", + ")\n", + "df7\n" + ], + "id": "b18d9ba71ba63d9d", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 5.33 s\n", + "Wall time: 1.5 s\n" + ] + }, + { + "data": { + "text/plain": [ + "shape: (88_845, 5)\n", + "┌─────────┬────────────────┬──────────────────┬────────────────────┬──────────────────────┐\n", + "│ index ┆ Identifiant PP ┆ Profession_Count ┆ Libellé profession ┆ Libellé savoir-faire │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ str ┆ u32 ┆ str ┆ str │\n", + "╞═════════╪════════════════╪══════════════════╪════════════════════╪══════════════════════╡\n", + "│ 74 ┆ 10000013150 ┆ 2 ┆ Médecin ┆ Psychiatrie │\n", + "│ 269913 ┆ 10000013150 ┆ 2 ┆ Psychothérapeute ┆ null │\n", + "│ 1352840 ┆ 10000017979 ┆ 2 ┆ Ostéopathe ┆ null │\n", + "│ 1893982 ┆ 10000017979 ┆ 2 ┆ Sage-Femme ┆ null │\n", + "│ 811125 ┆ 10000029966 ┆ 2 ┆ Sage-Femme ┆ null │\n", + "│ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 1352396 ┆ 10111320304 ┆ 2 ┆ Psychothérapeute ┆ null │\n", + "│ 269545 ┆ 10111320379 ┆ 2 ┆ Psychothérapeute ┆ null │\n", + "│ 810601 ┆ 10111320379 ┆ 2 ┆ Psychologue ┆ null │\n", + "│ 539711 ┆ 10111321468 ┆ 2 ┆ Psychothérapeute ┆ null │\n", + "│ 1352414 ┆ 10111321468 ┆ 2 ┆ Psychologue ┆ null │\n", + "└─────────┴────────────────┴──────────────────┴────────────────────┴──────────────────────┘" + ], + "text/html": [ + "
\n", + "shape: (88_845, 5)
indexIdentifiant PPProfession_CountLibellé professionLibellé savoir-faire
u32stru32strstr
74"10000013150"2"Médecin""Psychiatrie"
269913"10000013150"2"Psychothérapeute"null
1352840"10000017979"2"Ostéopathe"null
1893982"10000017979"2"Sage-Femme"null
811125"10000029966"2"Sage-Femme"null
1352396"10111320304"2"Psychothérapeute"null
269545"10111320379"2"Psychothérapeute"null
810601"10111320379"2"Psychologue"null
539711"10111321468"2"Psychothérapeute"null
1352414"10111321468"2"Psychologue"null
" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 70 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-19T22:46:19.893214Z", + "start_time": "2025-08-19T22:46:19.851765Z" + } + }, + "cell_type": "code", + "source": [ + "df7.write_csv(f\"{folder}{output_file}-Multiple_Professions{output_extension}\",\n", + " separator='|',\n", + " quote_style=\"never\",\n", + " line_terminator='\\n')\n" + ], + "id": "85be468fd3f461d1", + "outputs": [], + "execution_count": 71 + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Professionals_Multiple_Names2.ipynb b/Professionals_Multiple_Names2.ipynb new file mode 100644 index 0000000..14aa29b --- /dev/null +++ b/Professionals_Multiple_Names2.ipynb @@ -0,0 +1,465 @@ +{ + "cells": [ + { + "cell_type": "code", + "metadata": { + "tags": [], + "ExecuteTime": { + "end_time": "2025-08-12T19:54:05.652708Z", + "start_time": "2025-08-12T19:53:30.037989Z" + } + }, + "source": [ + "%%time\n", + "import pandas as pd\n", + "import csv\n", + "df = pd.read_csv(\"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\Table_Réf_Professionnels_250804.txt\", sep='|',\n", + " doublequote=False, quoting=csv.QUOTE_NONE, dtype=str, na_values='', keep_default_na=False)\n", + "df.index.name = 'index'\n", + "df.columns" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 35 s\n", + "Wall time: 35.6 s\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(['Type d'identifiant PP', 'Identifiant PP',\n", + " 'Identification nationale PP', 'Code civilité d'exercice',\n", + " 'Libellé civilité d'exercice', 'Code civilité', 'Libellé civilité',\n", + " 'Nom d'exercice', 'Prénom d'exercice', 'Code profession',\n", + " 'Libellé profession', 'Code catégorie professionnelle',\n", + " 'Libellé catégorie professionnelle', 'Code type savoir-faire',\n", + " 'Libellé type savoir-faire', 'Code savoir-faire',\n", + " 'Libellé savoir-faire', 'Code mode exercice', 'Libellé mode exercice',\n", + " 'Numéro SIRET site', 'Numéro SIREN site', 'Numéro FINESS site',\n", + " 'Numéro FINESS établissement juridique',\n", + " 'Identifiant technique de la structure', 'Raison sociale site',\n", + " 'Enseigne commerciale site',\n", + " 'Complément destinataire (coord. structure)',\n", + " 'Complément point géographique (coord. structure)',\n", + " 'Numéro Voie (coord. structure)',\n", + " 'Indice répétition voie (coord. structure)',\n", + " 'Code type de voie (coord. structure)',\n", + " 'Libellé type de voie (coord. structure)',\n", + " 'Libellé Voie (coord. structure)',\n", + " 'Mention distribution (coord. structure)',\n", + " 'Bureau cedex (coord. structure)', 'Code postal (coord. structure)',\n", + " 'Code commune (coord. structure)', 'Libellé commune (coord. structure)',\n", + " 'Code pays (coord. structure)', 'Libellé pays (coord. structure)',\n", + " 'Téléphone (coord. structure)', 'Téléphone 2 (coord. structure)',\n", + " 'Télécopie (coord. structure)', 'Adresse e-mail (coord. structure)',\n", + " 'Code Département (structure)', 'Libellé Département (structure)',\n", + " 'Ancien identifiant de la structure', 'Autorité d'enregistrement',\n", + " 'Code secteur d'activité', 'Libellé secteur d'activité',\n", + " 'Code section tableau pharmaciens',\n", + " 'Libellé section tableau pharmaciens', 'Code rôle', 'Libellé rôle',\n", + " 'Code genre activité', 'Libellé genre activité', 'Unnamed: 56'],\n", + " dtype='object')" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 19 + }, + { + "cell_type": "code", + "metadata": { + "tags": [], + "ExecuteTime": { + "end_time": "2025-08-12T19:54:12.829107Z", + "start_time": "2025-08-12T19:54:05.751406Z" + } + }, + "source": [ + "%%time\n", + "df2 = df.assign(Nom_Prénom = df[\"Nom d'exercice\"]+\" \"+df[\"Prénom d'exercice\"]) \\\n", + " .groupby('Identifiant PP')[['Nom_Prénom']] \\\n", + " .transform('nunique').rename(columns={'Nom_Prénom' : 'Count'}).query('Count > 1') \\\n", + " .join(df)[['Identifiant PP','Count',\"Nom d'exercice\",\"Prénom d'exercice\"]] \\\n", + " .sort_values(['Identifiant PP', 'index'])\n", + "\n", + "df2" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 6.81 s\n", + "Wall time: 7.06 s\n" + ] + }, + { + "data": { + "text/plain": [ + " Identifiant PP Count Nom d'exercice Prénom d'exercice\n", + "index \n", + "1350393 10000034180 2 DUWAT-GEORGES GHISLAINE\n", + "1350394 10000034180 2 GEORGES GHISLAINE\n", + "259 10000040062 2 MEYER Nicolas\n", + "260 10000040062 2 MEYER Nicolas\n", + "809702 10000040062 2 MEYER NICOLAS\n", + "... ... ... ... ...\n", + "2158383 10111077417 2 D'ELLOY FRANCETTE\n", + "537896 10111105358 2 HOMO Maddy\n", + "1889090 10111105358 2 Homo Maddy\n", + "537977 10111110721 2 ROCHEPEAU Nadège\n", + "2158797 10111110721 2 BARREAU Nadège\n", + "\n", + "[9059 rows x 4 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Identifiant PPCountNom d'exercicePrénom d'exercice
index
1350393100000341802DUWAT-GEORGESGHISLAINE
1350394100000341802GEORGESGHISLAINE
259100000400622MEYERNicolas
260100000400622MEYERNicolas
809702100000400622MEYERNICOLAS
...............
2158383101110774172D'ELLOYFRANCETTE
537896101111053582HOMOMaddy
1889090101111053582HomoMaddy
537977101111107212ROCHEPEAUNadège
2158797101111107212BARREAUNadège
\n", + "

9059 rows × 4 columns

\n", + "
" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 20 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-12T19:54:13.114103Z", + "start_time": "2025-08-12T19:54:13.063080Z" + } + }, + "cell_type": "code", + "source": [ + "df2.to_csv(\"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\Table_Réf_Professionnels_250804_Multiple_Names_2-1.csv\",\n", + " sep='|', index=True, doublequote=False, quoting=csv.QUOTE_NONE, lineterminator='\\n')" + ], + "outputs": [], + "execution_count": 21 + }, + { + "cell_type": "code", + "metadata": { + "tags": [], + "ExecuteTime": { + "end_time": "2025-08-12T19:54:20.671679Z", + "start_time": "2025-08-12T19:54:13.377047Z" + } + }, + "source": [ + "%%time\n", + "df3 = df.assign(Nom_Prénom = df[\"Nom d'exercice\"].str.lower()+\" \"+df[\"Prénom d'exercice\"].str.lower()) \\\n", + " .groupby('Identifiant PP')[['Nom_Prénom']] \\\n", + " .transform('nunique').rename(columns={'Nom_Prénom' : 'Count'}).query('Count > 1') \\\n", + " .join(df)[['Identifiant PP','Count',\"Nom d'exercice\",\"Prénom d'exercice\"]] \\\n", + " .sort_values(['Identifiant PP', 'index'])\n", + "df3" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 6.97 s\n", + "Wall time: 7.28 s\n" + ] + }, + { + "data": { + "text/plain": [ + " Identifiant PP Count Nom d'exercice Prénom d'exercice\n", + "index \n", + "1350393 10000034180 2 DUWAT-GEORGES GHISLAINE\n", + "1350394 10000034180 2 GEORGES GHISLAINE\n", + "1350470 10000046051 2 STUDER AGNES\n", + "1620048 10000046051 2 JURION AGNES\n", + "269964 10000101518 2 BARREYRE SANDRINE\n", + "... ... ... ... ...\n", + "1617156 10110987236 2 ROGIER MATHILDE\n", + "807882 10111077417 2 DOUVIER FRANCETTE\n", + "2158383 10111077417 2 D'ELLOY FRANCETTE\n", + "537977 10111110721 2 ROCHEPEAU Nadège\n", + "2158797 10111110721 2 BARREAU Nadège\n", + "\n", + "[5395 rows x 4 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Identifiant PPCountNom d'exercicePrénom d'exercice
index
1350393100000341802DUWAT-GEORGESGHISLAINE
1350394100000341802GEORGESGHISLAINE
1350470100000460512STUDERAGNES
1620048100000460512JURIONAGNES
269964100001015182BARREYRESANDRINE
...............
1617156101109872362ROGIERMATHILDE
807882101110774172DOUVIERFRANCETTE
2158383101110774172D'ELLOYFRANCETTE
537977101111107212ROCHEPEAUNadège
2158797101111107212BARREAUNadège
\n", + "

5395 rows × 4 columns

\n", + "
" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 22 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-12T19:54:21.102182Z", + "start_time": "2025-08-12T19:54:21.072806Z" + } + }, + "cell_type": "code", + "source": [ + "df3.to_csv(\"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\Table_Réf_Professionnels_250804_Multiple_Names_2-2.csv\",\n", + " sep='|', index=True, doublequote=False, quoting=csv.QUOTE_NONE, lineterminator='\\n')" + ], + "outputs": [], + "execution_count": 23 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Professionals_Multiple_Names3.ipynb b/Professionals_Multiple_Names3.ipynb new file mode 100644 index 0000000..4e43354 --- /dev/null +++ b/Professionals_Multiple_Names3.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "code", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "ExecuteTime": { + "end_time": "2025-08-11T22:50:03.135959Z", + "start_time": "2025-08-11T22:49:26.824618Z" + } + }, + "source": [ + "%%time\n", + "import pandas as pd\n", + "import csv\n", + "df = pd.read_csv(\"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\Table_Réf_Professionnels_250804.txt\",\n", + " sep='|', doublequote=False, quoting=csv.QUOTE_NONE, dtype=str, na_values='', keep_default_na=False)\n", + "df.index.name = 'index'\n", + "df.columns" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 35.9 s\n", + "Wall time: 36.3 s\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(['Type d'identifiant PP', 'Identifiant PP',\n", + " 'Identification nationale PP', 'Code civilité d'exercice',\n", + " 'Libellé civilité d'exercice', 'Code civilité', 'Libellé civilité',\n", + " 'Nom d'exercice', 'Prénom d'exercice', 'Code profession',\n", + " 'Libellé profession', 'Code catégorie professionnelle',\n", + " 'Libellé catégorie professionnelle', 'Code type savoir-faire',\n", + " 'Libellé type savoir-faire', 'Code savoir-faire',\n", + " 'Libellé savoir-faire', 'Code mode exercice', 'Libellé mode exercice',\n", + " 'Numéro SIRET site', 'Numéro SIREN site', 'Numéro FINESS site',\n", + " 'Numéro FINESS établissement juridique',\n", + " 'Identifiant technique de la structure', 'Raison sociale site',\n", + " 'Enseigne commerciale site',\n", + " 'Complément destinataire (coord. structure)',\n", + " 'Complément point géographique (coord. structure)',\n", + " 'Numéro Voie (coord. structure)',\n", + " 'Indice répétition voie (coord. structure)',\n", + " 'Code type de voie (coord. structure)',\n", + " 'Libellé type de voie (coord. structure)',\n", + " 'Libellé Voie (coord. structure)',\n", + " 'Mention distribution (coord. structure)',\n", + " 'Bureau cedex (coord. structure)', 'Code postal (coord. structure)',\n", + " 'Code commune (coord. structure)', 'Libellé commune (coord. structure)',\n", + " 'Code pays (coord. structure)', 'Libellé pays (coord. structure)',\n", + " 'Téléphone (coord. structure)', 'Téléphone 2 (coord. structure)',\n", + " 'Télécopie (coord. structure)', 'Adresse e-mail (coord. structure)',\n", + " 'Code Département (structure)', 'Libellé Département (structure)',\n", + " 'Ancien identifiant de la structure', 'Autorité d'enregistrement',\n", + " 'Code secteur d'activité', 'Libellé secteur d'activité',\n", + " 'Code section tableau pharmaciens',\n", + " 'Libellé section tableau pharmaciens', 'Code rôle', 'Libellé rôle',\n", + " 'Code genre activité', 'Libellé genre activité', 'Unnamed: 56'],\n", + " dtype='object')" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 38 + }, + { + "cell_type": "code", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "ExecuteTime": { + "end_time": "2025-08-11T22:53:55.986443Z", + "start_time": "2025-08-11T22:50:03.157898Z" + } + }, + "source": [ + "%%time\n", + "df2 = df.assign(Nom_Prénom = df[\"Nom d'exercice\"]+\" \"+df[\"Prénom d'exercice\"]) \\\n", + " [['Identifiant PP','Nom_Prénom',\"Nom d'exercice\",\"Prénom d'exercice\"]] \\\n", + " .groupby('Identifiant PP') \\\n", + " .filter(lambda f: f['Nom_Prénom'].nunique()>1) \\\n", + " .sort_values(['Identifiant PP','index']) \\\n", + " [['Identifiant PP',\"Nom d'exercice\",\"Prénom d'exercice\"]]\n", + "df2" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 3min 44s\n", + "Wall time: 3min 52s\n" + ] + }, + { + "data": { + "text/plain": [ + " Identifiant PP Nom d'exercice Prénom d'exercice\n", + "index \n", + "1350393 10000034180 DUWAT-GEORGES GHISLAINE\n", + "1350394 10000034180 GEORGES GHISLAINE\n", + "259 10000040062 MEYER Nicolas\n", + "260 10000040062 MEYER Nicolas\n", + "809702 10000040062 MEYER NICOLAS\n", + "... ... ... ...\n", + "2158383 10111077417 D'ELLOY FRANCETTE\n", + "537896 10111105358 HOMO Maddy\n", + "1889090 10111105358 Homo Maddy\n", + "537977 10111110721 ROCHEPEAU Nadège\n", + "2158797 10111110721 BARREAU Nadège\n", + "\n", + "[9059 rows x 3 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Identifiant PPNom d'exercicePrénom d'exercice
index
135039310000034180DUWAT-GEORGESGHISLAINE
135039410000034180GEORGESGHISLAINE
25910000040062MEYERNicolas
26010000040062MEYERNicolas
80970210000040062MEYERNICOLAS
............
215838310111077417D'ELLOYFRANCETTE
53789610111105358HOMOMaddy
188909010111105358HomoMaddy
53797710111110721ROCHEPEAUNadège
215879710111110721BARREAUNadège
\n", + "

9059 rows × 3 columns

\n", + "
" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 39 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-11T22:53:56.801020Z", + "start_time": "2025-08-11T22:53:56.699295Z" + } + }, + "cell_type": "code", + "source": "df2.to_csv(\"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\Table_Réf_Professionnels_250804_Multiple_Names_3-1.csv\", sep='|', index=True, doublequote=False, quoting=csv.QUOTE_NONE, lineterminator='\\n')", + "outputs": [], + "execution_count": 40 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-11T22:57:28.643070Z", + "start_time": "2025-08-11T22:53:56.870889Z" + } + }, + "cell_type": "code", + "source": [ + "%%time\n", + "df3 = df.assign(Nom_Prénom = df[\"Nom d'exercice\"].str.lower()+\" \"+df[\"Prénom d'exercice\"].str.lower()) \\\n", + " [['Identifiant PP','Nom_Prénom',\"Nom d'exercice\",\"Prénom d'exercice\"]] \\\n", + " .groupby('Identifiant PP') \\\n", + " .filter(lambda f: f['Nom_Prénom'].nunique()>1) \\\n", + " .sort_values(['Identifiant PP','index']) \\\n", + " [['Identifiant PP',\"Nom d'exercice\",\"Prénom d'exercice\"]]\n", + "df3" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 3min 26s\n", + "Wall time: 3min 31s\n" + ] + }, + { + "data": { + "text/plain": [ + " Identifiant PP Nom d'exercice Prénom d'exercice\n", + "index \n", + "1350393 10000034180 DUWAT-GEORGES GHISLAINE\n", + "1350394 10000034180 GEORGES GHISLAINE\n", + "1350470 10000046051 STUDER AGNES\n", + "1620048 10000046051 JURION AGNES\n", + "269964 10000101518 BARREYRE SANDRINE\n", + "... ... ... ...\n", + "1617156 10110987236 ROGIER MATHILDE\n", + "807882 10111077417 DOUVIER FRANCETTE\n", + "2158383 10111077417 D'ELLOY FRANCETTE\n", + "537977 10111110721 ROCHEPEAU Nadège\n", + "2158797 10111110721 BARREAU Nadège\n", + "\n", + "[5395 rows x 3 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Identifiant PPNom d'exercicePrénom d'exercice
index
135039310000034180DUWAT-GEORGESGHISLAINE
135039410000034180GEORGESGHISLAINE
135047010000046051STUDERAGNES
162004810000046051JURIONAGNES
26996410000101518BARREYRESANDRINE
............
161715610110987236ROGIERMATHILDE
80788210111077417DOUVIERFRANCETTE
215838310111077417D'ELLOYFRANCETTE
53797710111110721ROCHEPEAUNadège
215879710111110721BARREAUNadège
\n", + "

5395 rows × 3 columns

\n", + "
" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 41 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-11T22:57:29.038232Z", + "start_time": "2025-08-11T22:57:29.014447Z" + } + }, + "cell_type": "code", + "source": "df3.to_csv(\"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\Table_Réf_Professionnels_250804_Multiple_Names_3-2.csv\", sep='|', index=True, doublequote=False, quoting=csv.QUOTE_NONE, lineterminator='\\n')", + "outputs": [], + "execution_count": 42 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Professionals_Sort.ipynb b/Professionals_Sort.ipynb new file mode 100644 index 0000000..905ac95 --- /dev/null +++ b/Professionals_Sort.ipynb @@ -0,0 +1,140 @@ +{ + "cells": [ + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import csv\n", + "filename = 'Table_Réf_Professionnels_250815'\n", + "df = pd.read_csv(\"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\\"+filename+\".csv\", sep='|', doublequote=False, quoting=csv.QUOTE_NONE,dtype=str, na_values='', keep_default_na=False)\n", + "df.columns" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-08-16T01:35:13.352289Z", + "start_time": "2025-08-16T01:34:58.550068Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Type d'identifiant PP', 'Identifiant PP',\n", + " 'Identification nationale PP', 'Code civilité d'exercice',\n", + " 'Libellé civilité d'exercice', 'Code civilité', 'Libellé civilité',\n", + " 'Nom d'exercice', 'Prénom d'exercice', 'Code profession',\n", + " 'Libellé profession', 'Code catégorie professionnelle',\n", + " 'Libellé catégorie professionnelle', 'Code type savoir-faire',\n", + " 'Libellé type savoir-faire', 'Code savoir-faire',\n", + " 'Libellé savoir-faire', 'Code mode exercice', 'Libellé mode exercice',\n", + " 'Numéro SIRET site', 'Numéro SIREN site', 'Numéro FINESS site',\n", + " 'Numéro FINESS établissement juridique',\n", + " 'Identifiant technique de la structure', 'Raison sociale site',\n", + " 'Enseigne commerciale site',\n", + " 'Complément destinataire (coord. structure)',\n", + " 'Complément point géographique (coord. structure)',\n", + " 'Numéro Voie (coord. structure)',\n", + " 'Indice répétition voie (coord. structure)',\n", + " 'Code type de voie (coord. structure)',\n", + " 'Libellé type de voie (coord. structure)',\n", + " 'Libellé Voie (coord. structure)',\n", + " 'Mention distribution (coord. structure)',\n", + " 'Bureau cedex (coord. structure)', 'Code postal (coord. structure)',\n", + " 'Code commune (coord. structure)', 'Libellé commune (coord. structure)',\n", + " 'Code pays (coord. structure)', 'Libellé pays (coord. structure)',\n", + " 'Téléphone (coord. structure)', 'Téléphone 2 (coord. structure)',\n", + " 'Télécopie (coord. structure)', 'Adresse e-mail (coord. structure)',\n", + " 'Code Département (structure)', 'Libellé Département (structure)',\n", + " 'Ancien identifiant de la structure', 'Autorité d'enregistrement',\n", + " 'Code secteur d'activité', 'Libellé secteur d'activité',\n", + " 'Code section tableau pharmaciens',\n", + " 'Libellé section tableau pharmaciens', 'Code rôle', 'Libellé rôle',\n", + " 'Code genre activité', 'Libellé genre activité', 'Unnamed: 56'],\n", + " dtype='object')" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 1 + }, + { + "cell_type": "code", + "source": [ + "df.shape" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-08-16T01:35:13.450995Z", + "start_time": "2025-08-16T01:35:13.442103Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(994582, 57)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 2 + }, + { + "cell_type": "code", + "source": [ + "df_sorted = df.sort_values([\"Identifiant PP\", \"Nom d'exercice\", \"Prénom d'exercice\", 'Libellé profession', 'Libellé savoir-faire', 'Bureau cedex (coord. structure)', 'Numéro Voie (coord. structure)', 'Indice répétition voie (coord. structure)', 'Libellé type de voie (coord. structure)', 'Libellé Voie (coord. structure)' , 'Mention distribution (coord. structure)', 'Téléphone (coord. structure)'])" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-08-16T01:35:20.312959Z", + "start_time": "2025-08-16T01:35:13.656674Z" + } + }, + "outputs": [], + "execution_count": 3 + }, + { + "cell_type": "code", + "source": "df_sorted.to_csv(\"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\\"+filename+\"-sorted.csv\", sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE, lineterminator='\\n')", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-08-16T01:35:34.463854Z", + "start_time": "2025-08-16T01:35:20.454076Z" + } + }, + "outputs": [], + "execution_count": 4 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Resendo.ipynb b/Resendo.ipynb new file mode 100644 index 0000000..a78fa2d --- /dev/null +++ b/Resendo.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "df = pd.read_excel(\"G:\\Mon Drive\\Ziwig-Health\\Data\\Extract_Prof_Patient_List.xlsx\", header=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "(7728, 9)" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "timedDf = df.set_index('createdAt')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "timedDf['count']=True" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "timedDf['NonStarted']=1-timedDf['isStartMedicalRecord'].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "timedDf['NonFinished']=1-timedDf['isFinishMedicalRecord'].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "timedDf=timedDf.loc[:, ['isStartMedicalRecord','isFinishMedicalRecord','count','NonStarted','NonFinished']]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "timedDfMonthly = timedDf.resample('M')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": " isStartMedicalRecord isFinishMedicalRecord count NonStarted \\\ncreatedAt \n2020-11-30 2 2 2 0 \n2020-12-31 3 3 3 0 \n2021-01-31 21 21 21 0 \n2021-02-28 10 10 10 0 \n2021-03-31 348 323 404 56 \n2021-04-30 602 559 712 110 \n2021-05-31 511 465 622 111 \n2021-06-30 406 372 503 97 \n2021-07-31 426 398 498 72 \n2021-08-31 429 393 528 99 \n2021-09-30 561 517 677 116 \n2021-10-31 580 539 696 116 \n2021-11-30 453 416 557 104 \n2021-12-31 480 447 608 128 \n2022-01-31 608 562 786 178 \n2022-02-28 544 502 704 160 \n2022-03-31 286 255 397 111 \n\n NonFinished \ncreatedAt \n2020-11-30 0 \n2020-12-31 0 \n2021-01-31 0 \n2021-02-28 0 \n2021-03-31 81 \n2021-04-30 153 \n2021-05-31 157 \n2021-06-30 131 \n2021-07-31 100 \n2021-08-31 135 \n2021-09-30 160 \n2021-10-31 157 \n2021-11-30 141 \n2021-12-31 161 \n2022-01-31 224 \n2022-02-28 202 \n2022-03-31 142 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
isStartMedicalRecordisFinishMedicalRecordcountNonStartedNonFinished
createdAt
2020-11-3022200
2020-12-3133300
2021-01-3121212100
2021-02-2810101000
2021-03-313483234045681
2021-04-30602559712110153
2021-05-31511465622111157
2021-06-3040637250397131
2021-07-3142639849872100
2021-08-3142939352899135
2021-09-30561517677116160
2021-10-31580539696116157
2021-11-30453416557104141
2021-12-31480447608128161
2022-01-31608562786178224
2022-02-28544502704160202
2022-03-31286255397111142
\n
" + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "timedDfMonthly.sum()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/Resendo2.ipynb b/Resendo2.ipynb new file mode 100644 index 0000000..7e21ad2 --- /dev/null +++ b/Resendo2.ipynb @@ -0,0 +1,163 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 5.89 s\n", + "Wall time: 6.14 s\n" + ] + }, + { + "data": { + "text/plain": "(31371, 9)" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "import pandas as pd\n", + "df = pd.read_excel(\"G:\\Mon Drive\\Ziwig-Health\\Data\\Extract_Prof_Patient_List_new.xlsx\", header=2)\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 46.9 ms\n", + "Wall time: 49.9 ms\n" + ] + }, + { + "data": { + "text/plain": "isStartMedicalRecord False True \nisFinishMedicalRecord False False True\ncreatedAt \n2020-11-30 169 9 78\n2020-12-31 226 16 147\n2021-01-31 149 388 842\n2021-02-28 238 164 606\n2021-03-31 652 453 2262\n2021-04-30 250 118 1141\n2021-05-31 269 144 1106\n2021-06-30 283 150 1012\n2021-07-31 227 127 883\n2021-08-31 196 111 912\n2021-09-30 223 142 1254\n2021-10-31 224 112 1176\n2021-11-30 229 110 988\n2021-12-31 466 111 925\n2022-01-31 753 287 1766\n2022-02-28 1095 549 2362\n2022-03-31 520 176 1242\n2022-04-30 395 125 849\n2022-05-31 363 99 771\n2022-06-30 233 65 433", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
isStartMedicalRecordFalseTrue
isFinishMedicalRecordFalseFalseTrue
createdAt
2020-11-30169978
2020-12-3122616147
2021-01-31149388842
2021-02-28238164606
2021-03-316524532262
2021-04-302501181141
2021-05-312691441106
2021-06-302831501012
2021-07-31227127883
2021-08-31196111912
2021-09-302231421254
2021-10-312241121176
2021-11-30229110988
2021-12-31466111925
2022-01-317532871766
2022-02-2810955492362
2022-03-315201761242
2022-04-30395125849
2022-05-3136399771
2022-06-3023365433
\n
" + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "pd.pivot_table(data = df, index=pd.Grouper(key=\"createdAt\", freq=\"M\"), columns=[\"isStartMedicalRecord\",\"isFinishMedicalRecord\"], values=\"fullName\", aggfunc=\"count\", fill_value= 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 46.9 ms\n", + "Wall time: 46.9 ms\n" + ] + }, + { + "data": { + "text/plain": "isStartMedicalRecord False True\ncreatedAt \n2020-11-30 169 87\n2020-12-31 226 163\n2021-01-31 149 1230\n2021-02-28 238 770\n2021-03-31 652 2715\n2021-04-30 250 1259\n2021-05-31 269 1250\n2021-06-30 283 1162\n2021-07-31 227 1010\n2021-08-31 196 1023\n2021-09-30 223 1396\n2021-10-31 224 1288\n2021-11-30 229 1098\n2021-12-31 466 1036\n2022-01-31 753 2053\n2022-02-28 1095 2911\n2022-03-31 520 1418\n2022-04-30 395 974\n2022-05-31 363 870\n2022-06-30 233 498", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
isStartMedicalRecordFalseTrue
createdAt
2020-11-3016987
2020-12-31226163
2021-01-311491230
2021-02-28238770
2021-03-316522715
2021-04-302501259
2021-05-312691250
2021-06-302831162
2021-07-312271010
2021-08-311961023
2021-09-302231396
2021-10-312241288
2021-11-302291098
2021-12-314661036
2022-01-317532053
2022-02-2810952911
2022-03-315201418
2022-04-30395974
2022-05-31363870
2022-06-30233498
\n
" + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "pd.pivot_table(data = df, index=pd.Grouper(key=\"createdAt\", freq=\"M\"), columns=[\"isStartMedicalRecord\"], values=\"fullName\", aggfunc=\"count\", fill_value=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 62.5 ms\n", + "Wall time: 58.8 ms\n" + ] + }, + { + "data": { + "text/plain": "isFinishMedicalRecord False True\ncreatedAt \n2020-11-30 178 78\n2020-12-31 242 147\n2021-01-31 537 842\n2021-02-28 402 606\n2021-03-31 1105 2262\n2021-04-30 368 1141\n2021-05-31 413 1106\n2021-06-30 433 1012\n2021-07-31 354 883\n2021-08-31 307 912\n2021-09-30 365 1254\n2021-10-31 336 1176\n2021-11-30 339 988\n2021-12-31 577 925\n2022-01-31 1040 1766\n2022-02-28 1644 2362\n2022-03-31 696 1242\n2022-04-30 520 849\n2022-05-31 462 771\n2022-06-30 298 433", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
isFinishMedicalRecordFalseTrue
createdAt
2020-11-3017878
2020-12-31242147
2021-01-31537842
2021-02-28402606
2021-03-3111052262
2021-04-303681141
2021-05-314131106
2021-06-304331012
2021-07-31354883
2021-08-31307912
2021-09-303651254
2021-10-313361176
2021-11-30339988
2021-12-31577925
2022-01-3110401766
2022-02-2816442362
2022-03-316961242
2022-04-30520849
2022-05-31462771
2022-06-30298433
\n
" + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "pd.pivot_table(data = df, index=pd.Grouper(key=\"createdAt\", freq=\"M\"), columns=[\"isFinishMedicalRecord\"], values=\"fullName\", aggfunc=\"count\", fill_value=0)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/Table_Réf_Professionnels - Copie.xlsx b/Table_Réf_Professionnels - Copie.xlsx new file mode 100644 index 0000000..c7007a3 Binary files /dev/null and b/Table_Réf_Professionnels - Copie.xlsx differ diff --git a/Table_Réf_Professionnels.xlsx b/Table_Réf_Professionnels.xlsx new file mode 100644 index 0000000..18e8f47 Binary files /dev/null and b/Table_Réf_Professionnels.xlsx differ diff --git a/Table_Réf_Professionnels_inconsistencies.xlsx b/Table_Réf_Professionnels_inconsistencies.xlsx new file mode 100644 index 0000000..675a028 Binary files /dev/null and b/Table_Réf_Professionnels_inconsistencies.xlsx differ diff --git a/datacompyProfessionals.ipynb b/datacompyProfessionals.ipynb new file mode 100644 index 0000000..fdbe63c --- /dev/null +++ b/datacompyProfessionals.ipynb @@ -0,0 +1,276 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 48, + "id": "66b27b71bfe4a1e6", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-07T16:35:09.000042Z", + "start_time": "2024-03-07T16:35:08.917093Z" + } + }, + "outputs": [], + "source": [ + "\n", + "import pandas as pd\n", + "import datacompy\n", + "import csv" + ] + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "df1 = pd.read_csv(\"H:\\\\Mon Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\Table_Réf_Professionnels_240103.txt\", sep='|', doublequote=False, quoting=csv.QUOTE_NONE,dtype=str, na_values='', keep_default_na=False)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-07T16:35:57.199406Z", + "start_time": "2024-03-07T16:35:09.010084Z" + } + }, + "id": "2bf7e140e6e3a0cf", + "execution_count": 49 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "df2 = pd.read_csv(\"H:\\\\Mon Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\Table_Réf_Professionnels_240307.txt\", sep='|', doublequote=False, quoting=csv.QUOTE_NONE,dtype=str, na_values='', keep_default_na=False)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-07T16:36:52.304783Z", + "start_time": "2024-03-07T16:35:57.203517Z" + } + }, + "id": "f55e34a990ae89a9", + "execution_count": 50 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "compare = datacompy.Compare(\n", + " df1,\n", + " df2,\n", + " join_columns=['Identification nationale PP',\n", + " 'Code profession',\n", + " 'Code catégorie professionnelle',\n", + " 'Code type savoir-faire',\n", + " 'Code savoir-faire',\n", + " 'Code mode exercice', \n", + " 'Numéro SIRET site', \n", + " 'Numéro FINESS site',\n", + " 'Identifiant technique de la structure',\n", + " 'Code rôle'], #You can also specify a list of columns\n", + " abs_tol=0, #Optional, defaults to 0\n", + " rel_tol=0, #Optional, defaults to 0\n", + " df1_name='Original', #Optional, defaults to 'df1'\n", + " df2_name='New' #Optional, defaults to 'df2'\n", + " )" + ], + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-03-07T16:49:52.268885Z", + "start_time": "2024-03-07T16:45:38.231325Z" + } + }, + "id": "initial_id", + "execution_count": 59 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "mismatch = compare.all_mismatch(ignore_matching_cols=True)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-07T16:50:11.544899Z", + "start_time": "2024-03-07T16:49:52.305809Z" + } + }, + "id": "2f16ab257397f6c9", + "execution_count": 60 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "(117966, 82)" + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mismatch.shape" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-07T16:50:11.571499Z", + "start_time": "2024-03-07T16:50:11.547923Z" + } + }, + "id": "7b85bbc3f923fe64", + "execution_count": 61 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " identification nationale pp code profession \\\n247 0010002616 72 \n1604 0019101898 91 \n1628 0019102417 91 \n2010 0019300995 93 \n2270 0019303460 93 \n... ... ... \n2217837 810109452887 99 \n2217838 810109452895 99 \n2217850 810109453018 99 \n2217851 810109453026 99 \n2217856 810109453075 99 \n\n code catégorie professionnelle code type savoir-faire \\\n247 C NaN \n1604 C NaN \n1628 C NaN \n2010 C NaN \n2270 C NaN \n... ... ... \n2217837 C NaN \n2217838 C NaN \n2217850 C NaN \n2217851 C NaN \n2217856 C NaN \n\n code savoir-faire code mode exercice numéro siret site \\\n247 NaN S 81157144700010 \n1604 NaN S 77567246200345 \n1628 NaN L NaN \n2010 NaN S 77554456200611 \n2270 NaN S 77554456200116 \n... ... ... ... \n2217837 NaN S NaN \n2217838 NaN S NaN \n2217850 NaN S 81899316400016 \n2217851 NaN S 82459575500011 \n2217856 NaN S NaN \n\n numéro finess site identifiant technique de la structure code rôle \\\n247 010780203 F010780203 NaN \n1604 010780609 F010780609 NaN \n1628 NaN C01910241700 FON-01 \n2010 010784262 F010784262 NaN \n2270 010780591 F010780591 NaN \n... ... ... ... \n2217837 970404372 F970404372 312 \n2217838 440015857 F44001585701042021 317 \n2217850 770024271 F770024271 312 \n2217851 690043179 F690043179 312 \n2217856 190007500 F19000750001102007 317 \n\n ... code section tableau pharmaciens_df1 \\\n247 ... NaN \n1604 ... NaN \n1628 ... NaN \n2010 ... NaN \n2270 ... NaN \n... ... ... \n2217837 ... NaN \n2217838 ... NaN \n2217850 ... NaN \n2217851 ... NaN \n2217856 ... NaN \n\n code section tableau pharmaciens_df2 \\\n247 NaN \n1604 NaN \n1628 NaN \n2010 NaN \n2270 NaN \n... ... \n2217837 NaN \n2217838 NaN \n2217850 NaN \n2217851 NaN \n2217856 NaN \n\n libellé section tableau pharmaciens_df1 \\\n247 NaN \n1604 NaN \n1628 NaN \n2010 NaN \n2270 NaN \n... ... \n2217837 NaN \n2217838 NaN \n2217850 NaN \n2217851 NaN \n2217856 NaN \n\n libellé section tableau pharmaciens_df2 \\\n247 NaN \n1604 NaN \n1628 NaN \n2010 NaN \n2270 NaN \n... ... \n2217837 NaN \n2217838 NaN \n2217850 NaN \n2217851 NaN \n2217856 NaN \n\n libellé rôle_df1 \\\n247 NaN \n1604 NaN \n1628 Titulaire de cabinet \n2010 NaN \n2270 NaN \n... ... \n2217837 Autre professionnel \n2217838 Préparateur en pharmacie (officine) \n2217850 Autre professionnel \n2217851 Autre professionnel \n2217856 Préparateur en pharmacie (officine) \n\n libellé rôle_df2 code genre activité_df1 \\\n247 NaN NaN \n1604 NaN NaN \n1628 Titulaire de cabinet NaN \n2010 NaN NaN \n2270 NaN NaN \n... ... ... \n2217837 Autre professionnel GENR12 \n2217838 Préparateur en pharmacie (officine) GENR01 \n2217850 Autre professionnel GENR08 \n2217851 Autre professionnel GENR12 \n2217856 Préparateur en pharmacie (officine) GENR01 \n\n code genre activité_df2 \\\n247 NaN \n1604 NaN \n1628 NaN \n2010 NaN \n2270 NaN \n... ... \n2217837 GENR12 \n2217838 GENR01 \n2217850 GENR08 \n2217851 GENR12 \n2217856 GENR01 \n\n libellé genre activité_df1 \\\n247 NaN \n1604 NaN \n1628 NaN \n2010 NaN \n2270 NaN \n... ... \n2217837 Encadrement et orga accompagnt social/médico-soc \n2217838 Activité standard de soin ou de pharmacien \n2217850 Activité de coordination et d'orientation \n2217851 Encadrement et orga accompagnt social/médico-soc \n2217856 Activité standard de soin ou de pharmacien \n\n libellé genre activité_df2 \n247 NaN \n1604 NaN \n1628 NaN \n2010 NaN \n2270 NaN \n... ... \n2217837 Encadrement et orga accompagnt social/médico-soc \n2217838 Activité standard de soin ou de pharmacien \n2217850 Activité de coordination et d'orientation \n2217851 Encadrement et orga accompagnt social/médico-soc \n2217856 Activité standard de soin ou de pharmacien \n\n[117966 rows x 82 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
identification nationale ppcode professioncode catégorie professionnellecode type savoir-fairecode savoir-fairecode mode exercicenuméro siret sitenuméro finess siteidentifiant technique de la structurecode rôle...code section tableau pharmaciens_df1code section tableau pharmaciens_df2libellé section tableau pharmaciens_df1libellé section tableau pharmaciens_df2libellé rôle_df1libellé rôle_df2code genre activité_df1code genre activité_df2libellé genre activité_df1libellé genre activité_df2
247001000261672CNaNNaNS81157144700010010780203F010780203NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1604001910189891CNaNNaNS77567246200345010780609F010780609NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1628001910241791CNaNNaNLNaNNaNC01910241700FON-01...NaNNaNNaNNaNTitulaire de cabinetTitulaire de cabinetNaNNaNNaNNaN
2010001930099593CNaNNaNS77554456200611010784262F010784262NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2270001930346093CNaNNaNS77554456200116010780591F010780591NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
221783781010945288799CNaNNaNSNaN970404372F970404372312...NaNNaNNaNNaNAutre professionnelAutre professionnelGENR12GENR12Encadrement et orga accompagnt social/médico-socEncadrement et orga accompagnt social/médico-soc
221783881010945289599CNaNNaNSNaN440015857F44001585701042021317...NaNNaNNaNNaNPréparateur en pharmacie (officine)Préparateur en pharmacie (officine)GENR01GENR01Activité standard de soin ou de pharmacienActivité standard de soin ou de pharmacien
221785081010945301899CNaNNaNS81899316400016770024271F770024271312...NaNNaNNaNNaNAutre professionnelAutre professionnelGENR08GENR08Activité de coordination et d'orientationActivité de coordination et d'orientation
221785181010945302699CNaNNaNS82459575500011690043179F690043179312...NaNNaNNaNNaNAutre professionnelAutre professionnelGENR12GENR12Encadrement et orga accompagnt social/médico-socEncadrement et orga accompagnt social/médico-soc
221785681010945307599CNaNNaNSNaN190007500F19000750001102007317...NaNNaNNaNNaNPréparateur en pharmacie (officine)Préparateur en pharmacie (officine)GENR01GENR01Activité standard de soin ou de pharmacienActivité standard de soin ou de pharmacien
\n

117966 rows × 82 columns

\n
" + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mismatch" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-07T16:50:11.736073Z", + "start_time": "2024-03-07T16:50:11.584819Z" + } + }, + "id": "9331dbcc33b567fd", + "execution_count": 62 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "(234567, 57)" + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compare.df1_unq_rows.shape" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-07T16:50:11.747805Z", + "start_time": "2024-03-07T16:50:11.739092Z" + } + }, + "id": "f38ecf439538fc9b", + "execution_count": 63 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "(255343, 57)" + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compare.df2_unq_rows.shape" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-07T16:50:11.765984Z", + "start_time": "2024-03-07T16:50:11.752578Z" + } + }, + "id": "b0a4c80da0847ac0", + "execution_count": 64 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "(1976330, 57)" + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1.shape" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-07T16:50:11.783138Z", + "start_time": "2024-03-07T16:50:11.769029Z" + } + }, + "id": "b9aa33151fa6f235", + "execution_count": 65 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": "(1997106, 57)" + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.shape" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-07T16:50:11.795092Z", + "start_time": "2024-03-07T16:50:11.785223Z" + } + }, + "id": "aaa69421db146ed7", + "execution_count": 66 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/datacompyTest.ipynb b/datacompyTest.ipynb new file mode 100644 index 0000000..685b896 --- /dev/null +++ b/datacompyTest.ipynb @@ -0,0 +1,204 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 23, + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-01-23T18:53:49.676160800Z", + "start_time": "2024-01-23T18:53:49.620035200Z" + } + }, + "outputs": [], + "source": [ + "\n", + "from io import StringIO\n", + "import pandas as pd\n", + "import datacompy\n", + "\n", + "data1 = \"\"\"acct_id,dollar_amt,name,float_fld,date_fld\n", + "10000001234,123.45,George Maharis,14530.1555,2017-01-01\n", + "10000001235,0.45,Michael Bluth,1,2017-01-01\n", + "10000001236,1345,George Bluth,,2017-01-01\n", + "10000001237,123456,Bob Loblaw,345.12,2017-01-01\n", + "10000001237,123457,Bob Loblaw,345.12,2017-01-01\n", + "10000001239,1.05,Lucille Bluth,,2017-01-01\n", + "\"\"\"\n", + "\n", + "data2 = \"\"\"acct_id,dollar_amt,name,float_fld\n", + "10000001234,123.4,George Michael Bluth,14530.155\n", + "10000001235,0.45,Michael Bluth,\n", + "10000001236,1345,George Bluth,1\n", + "10000001237,123456,Robert Loblaw,345.12\n", + "10000001238,1.05,Loose Seal Bluth,111\n", + "\"\"\"\n", + "\n", + "df1 = pd.read_csv(StringIO(data1))\n", + "df2 = pd.read_csv(StringIO(data2))\n", + "\n", + "compare = datacompy.Compare(\n", + " df1,\n", + " df2,\n", + " join_columns='acct_id', #You can also specify a list of columns\n", + " abs_tol=0, #Optional, defaults to 0\n", + " rel_tol=0, #Optional, defaults to 0\n", + " df1_name='Original', #Optional, defaults to 'df1'\n", + " df2_name='New' #Optional, defaults to 'df2'\n", + " )\n" + ] + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " acct_id dollar_amt_df1 dollar_amt_df2 name_df1 \\\n0 10000001234 123.45 123.40 George Maharis \n1 10000001235 0.45 0.45 Michael Bluth \n2 10000001236 1345.00 1345.00 George Bluth \n3 10000001237 123456.00 123456.00 Bob Loblaw \n\n name_df2 float_fld_df1 float_fld_df2 \n0 George Michael Bluth 14530.1555 14530.155 \n1 Michael Bluth 1.0000 NaN \n2 George Bluth NaN 1.000 \n3 Robert Loblaw 345.1200 345.120 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
acct_iddollar_amt_df1dollar_amt_df2name_df1name_df2float_fld_df1float_fld_df2
010000001234123.45123.40George MaharisGeorge Michael Bluth14530.155514530.155
1100000012350.450.45Michael BluthMichael Bluth1.0000NaN
2100000012361345.001345.00George BluthGeorge BluthNaN1.000
310000001237123456.00123456.00Bob LoblawRobert Loblaw345.1200345.120
\n
" + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compare.all_mismatch(ignore_matching_cols=True)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-23T18:53:56.135115400Z", + "start_time": "2024-01-23T18:53:56.086349900Z" + } + }, + "id": "2f16ab257397f6c9", + "execution_count": 24 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " acct_id dollar_amt name float_fld date_fld\n4 10000001237 123457.00 Bob Loblaw 345.12 2017-01-01\n5 10000001239 1.05 Lucille Bluth NaN 2017-01-01", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
acct_iddollar_amtnamefloat_flddate_fld
410000001237123457.00Bob Loblaw345.122017-01-01
5100000012391.05Lucille BluthNaN2017-01-01
\n
" + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compare.df1_unq_rows" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-23T18:53:59.793951800Z", + "start_time": "2024-01-23T18:53:59.751624300Z" + } + }, + "id": "f38ecf439538fc9b", + "execution_count": 25 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " acct_id dollar_amt name float_fld\n6 10000001238 1.05 Loose Seal Bluth 111.0", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
acct_iddollar_amtnamefloat_fld
6100000012381.05Loose Seal Bluth111.0
\n
" + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compare.df2_unq_rows" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-23T18:54:20.805047600Z", + "start_time": "2024-01-23T18:54:20.777818600Z" + } + }, + "id": "b0a4c80da0847ac0", + "execution_count": 26 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " acct_id dollar_amt name float_fld date_fld\n0 10000001234 123.45 George Maharis 14530.1555 2017-01-01\n1 10000001235 0.45 Michael Bluth 1.0000 2017-01-01\n2 10000001236 1345.00 George Bluth NaN 2017-01-01\n3 10000001237 123456.00 Bob Loblaw 345.1200 2017-01-01\n4 10000001237 123457.00 Bob Loblaw 345.1200 2017-01-01\n5 10000001239 1.05 Lucille Bluth NaN 2017-01-01", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
acct_iddollar_amtnamefloat_flddate_fld
010000001234123.45George Maharis14530.15552017-01-01
1100000012350.45Michael Bluth1.00002017-01-01
2100000012361345.00George BluthNaN2017-01-01
310000001237123456.00Bob Loblaw345.12002017-01-01
410000001237123457.00Bob Loblaw345.12002017-01-01
5100000012391.05Lucille BluthNaN2017-01-01
\n
" + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-23T18:54:25.595365100Z", + "start_time": "2024-01-23T18:54:25.533925200Z" + } + }, + "id": "b9aa33151fa6f235", + "execution_count": 27 + }, + { + "cell_type": "code", + "outputs": [ + { + "data": { + "text/plain": " acct_id dollar_amt name float_fld\n0 10000001234 123.40 George Michael Bluth 14530.155\n1 10000001235 0.45 Michael Bluth NaN\n2 10000001236 1345.00 George Bluth 1.000\n3 10000001237 123456.00 Robert Loblaw 345.120\n4 10000001238 1.05 Loose Seal Bluth 111.000", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
acct_iddollar_amtnamefloat_fld
010000001234123.40George Michael Bluth14530.155
1100000012350.45Michael BluthNaN
2100000012361345.00George Bluth1.000
310000001237123456.00Robert Loblaw345.120
4100000012381.05Loose Seal Bluth111.000
\n
" + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-23T18:54:28.672000100Z", + "start_time": "2024-01-23T18:54:28.631719300Z" + } + }, + "id": "aaa69421db146ed7", + "execution_count": 28 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/prepareProfessionalsTable.py b/prepareProfessionalsTable.py new file mode 100644 index 0000000..3e7fa88 --- /dev/null +++ b/prepareProfessionalsTable.py @@ -0,0 +1,164 @@ +import argparse +import csv +import math +import sys +from os import path +from shutil import copyfileobj +from zipfile import ZipFile, is_zipfile +import numpy as np +import pandas as pd +import requests +from tqdm import tqdm # could use from tqdm.gui import tqdm +from tqdm.utils import CallbackIOWrapper +from urllib3.exceptions import InsecureRequestWarning +from urllib3 import disable_warnings +import questionary + + +def process_professionals_table(xls_file, txt_file, output_file): + # Load Excel Dataframes + xls = pd.read_excel(xls_file, sheet_name=None, dtype=str, + na_values='', keep_default_na=False) + professions = xls['F_Professions']['Professions'].tolist() + + # CSV Progressbar initialisation + estimated_total_rows = sum(1 for _ in open(txt_file, 'rb')) - 1 + chunk_size = 20000 + + # Iterating over CSV file + columns_to_clean = np.r_[0, 2, 4:7, 9, 11:16, 17:28, 30, 35:40, 41:56] + with tqdm(total=estimated_total_rows, desc=f'Writing to {path.basename(output_file)}', + leave=True, unit="Ln") as bar: + for i, df in enumerate(pd.read_csv(txt_file, sep='|', doublequote=False, quoting=csv.QUOTE_NONE, + dtype=str, na_values='', keep_default_na=False, chunksize=chunk_size)): + n_rows = df.shape[0] + df.iloc[:, columns_to_clean] = '' + df = df[df['Libellé profession'].isin(professions)] + + if i == 0: + df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE, + lineterminator='\n') + else: + df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE, + lineterminator='\n', header=False, mode='a') + + bar.update(n_rows) + bar.close() + + # Appending Other xls tabs + df = pd.concat([df[:0], xls['F_Append_Update'], xls['F_Etrangers'], + xls['F_Fake'], xls['F_Sophrologues']], ignore_index=True) + df.iloc[:, columns_to_clean] = '' + df = df[df['Libellé profession'].isin(professions)] + df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE, + lineterminator='\n', header=False, mode='a') + + +def download_file(url: str, filename: str = False) -> object: + if not filename: + local_filename = path.join(".", url.split('/')[-1]) + else: + local_filename = filename + disable_warnings(InsecureRequestWarning) + r = requests.get(url, stream=True, verify=False) + file_size = int(r.headers['Content-Length']) + unit_scale = 64 + + with open(local_filename, 'wb') as fp: + for chunk in tqdm(r.iter_content(chunk_size=unit_scale * 1024), + total=math.ceil(file_size / 1024 / unit_scale), + unit_scale=unit_scale, + unit='KB', + desc=f"Downloading to {path.basename(local_filename)}", + leave=True): + fp.write(chunk) + return + + +def extract_one_file_from_zip(zipfile, fromfile, tofile, desc=False): + if not desc: + desc = f"Extracting to {path.basename(tofile)}" + file = None + if not is_zipfile(zipfile): + return f"Can't open Zipfile (non existent or bad): {zipfile}" + zipf = ZipFile(zipfile) + for f in zipf.infolist(): + if getattr(f, "filename", "").startswith(fromfile): + file = f + break + if file is None: + return f"No such file name in the Zip ({fromfile}*)..." + + with zipf, tqdm( + desc=desc, unit="B", unit_scale=True, unit_divisor=1024, + total=getattr(file, "file_size", 0), leave=True, + ) as pbar: + with zipf.open(file) as fi, open(tofile, "wb") as fo: + copyfileobj(CallbackIOWrapper(pbar.update, fi), fo) + pbar.close() + + +def main(): + defaultFileName = 'Table_Réf_Professionnels' + defaultExcelFileName = 'Table_Réf_Professionnels' + internalFileName = 'PS_LibreAcces_Personne_activite' + + parser = argparse.ArgumentParser(description='Prepare Professionals Table for Import to Endoziwig.') + parser.add_argument('fileName', type=str, nargs='?', default=defaultFileName, + help=f'File name to use : default="{defaultFileName}"') + parser.add_argument('--excelFileName', '-x', type=str, nargs='?', default=defaultExcelFileName, + help=f'Excel File Containing Append Data: default="{defaultExcelFileName}" (without extension)') + parser.add_argument('--noDownload', '-ndw', action='store_true', + help='Do not Download the file (Default = Download).') + parser.add_argument('--noUnzip', '-nuz', action='store_true', + help='Do not Unzip the file (Default = Unzip).') + parser.add_argument('--noProcess', '-npr', action='store_true', + help='Do not Process the file (Default = Process).') + + args = parser.parse_args() + + if len(sys.argv) == 1: + print("You're about to download and prepare Professionals Table for import to Endoziwig") + + # Files Settings + if args.fileName == defaultFileName: + print("\n") + args.fileName = questionary.text("Please confirm file name (or empty to cancel):", + default=defaultFileName).ask() + if args.fileName == '': + sys.exit(0) + + BASE_DIR = path.dirname(path.abspath(__file__)) + zipFileName = path.join(BASE_DIR, f'{args.fileName}.zip') + xlsFileName = path.join(BASE_DIR, f'{args.excelFileName}.xlsx') + txtFileName = path.join(BASE_DIR, f'{args.fileName}.txt') + outputFileName = path.join(BASE_DIR, f'{args.fileName}.csv') + + print("\n") + + if not args.noDownload: + download_file( + 'https://service.annuaire.sante.fr/annuaire-sante-webservices/V300/services/extraction/PS_LibreAcces', + filename=zipFileName) + print("\n") + + if not args.noUnzip: + unzipResult = extract_one_file_from_zip(zipFileName, internalFileName, txtFileName) + if unzipResult is not None: + print(unzipResult) + print("\n") + + if not args.noProcess: + process_professionals_table(xlsFileName, txtFileName, outputFileName) + print("\n") + + +if __name__ == '__main__': + try : + main() + except(Exception) as e : + print(e) + finally : + input('Finished... Press Enter to continue') + print('\n') + \ No newline at end of file