{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "58db5082e27759f7", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:45:53.794236Z", "start_time": "2025-08-19T22:45:51.445477Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 5.73 s\n", "Wall time: 511 ms\n" ] }, { "data": { "text/plain": [ "['index',\n", " \"Type d'identifiant PP\",\n", " 'Identifiant PP',\n", " 'Identification nationale PP',\n", " \"Code civilité d'exercice\",\n", " \"Libellé civilité d'exercice\",\n", " 'Code civilité',\n", " 'Libellé civilité',\n", " \"Nom d'exercice\",\n", " \"Prénom d'exercice\",\n", " 'Code profession',\n", " 'Libellé profession',\n", " 'Code catégorie professionnelle',\n", " 'Libellé catégorie professionnelle',\n", " 'Code type savoir-faire',\n", " 'Libellé type savoir-faire',\n", " 'Code savoir-faire',\n", " 'Libellé savoir-faire',\n", " 'Code mode exercice',\n", " 'Libellé mode exercice',\n", " 'Numéro SIRET site',\n", " 'Numéro SIREN site',\n", " 'Numéro FINESS site',\n", " 'Numéro FINESS établissement juridique',\n", " 'Identifiant technique de la structure',\n", " 'Raison sociale site',\n", " 'Enseigne commerciale site',\n", " 'Complément destinataire (coord. structure)',\n", " 'Complément point géographique (coord. structure)',\n", " 'Numéro Voie (coord. structure)',\n", " 'Indice répétition voie (coord. structure)',\n", " 'Code type de voie (coord. structure)',\n", " 'Libellé type de voie (coord. structure)',\n", " 'Libellé Voie (coord. structure)',\n", " 'Mention distribution (coord. structure)',\n", " 'Bureau cedex (coord. structure)',\n", " 'Code postal (coord. structure)',\n", " 'Code commune (coord. structure)',\n", " 'Libellé commune (coord. structure)',\n", " 'Code pays (coord. structure)',\n", " 'Libellé pays (coord. structure)',\n", " 'Téléphone (coord. structure)',\n", " 'Téléphone 2 (coord. structure)',\n", " 'Télécopie (coord. structure)',\n", " 'Adresse e-mail (coord. 
structure)',\n", " 'Code Département (structure)',\n", " 'Libellé Département (structure)',\n", " 'Ancien identifiant de la structure',\n", " \"Autorité d'enregistrement\",\n", " \"Code secteur d'activité\",\n", " \"Libellé secteur d'activité\",\n", " 'Code section tableau pharmaciens',\n", " 'Libellé section tableau pharmaciens',\n", " 'Code rôle',\n", " 'Libellé rôle',\n", " 'Code genre activité',\n", " 'Libellé genre activité']" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "%%time\n",
 "import polars as pd\n",
 "import csv\n",
 "\n",
 "# NOTE: `pd` is Polars in this notebook, not pandas; every later cell relies on this alias.\n",
 "\n",
 "# Location of the reference extract and the naming pieces shared by all exported reports.\n",
 "folder = \"E:\\\\Ziwig Drive\\\\Ziwig Health\\\\Tables de Réf\\\\Professionnels\\\\\"\n",
 "input_file = \"Table_Réf_Professionnels_260319.txt\"\n",
 "output_file = \"Table_Réf_Professionnels_inconsistencies\"\n",
 "output_extension = \".csv\"\n",
 "\n",
 "# Pipe-separated file read without quote handling; empty fields are loaded as nulls.\n",
 "# infer_schema_length=0 disables schema inference, so all columns are read as strings.\n",
 "# A row-number column named 'index' is prepended; later cells sort on it and export it.\n",
 "df = pd.read_csv(\n",
 "    folder + input_file,\n",
 "    separator=\"|\",\n",
 "    quote_char=None,\n",
 "    null_values=\"\",\n",
 "    infer_schema_length=0,\n",
 ").with_row_index(\"index\")\n",
 "df.columns\n"
 ] }, { "cell_type": "code", "execution_count": 2, "id": "7d9b7562c09955", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:45:54.952210Z", "start_time": "2025-08-19T22:45:53.873718Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2.03 s\n", "Wall time: 389 ms\n" ] }, { "data": { "text/html": [ "
\n", "shape: (10_110, 5)
indexIdentifiant PPCountNom d'exercicePrénom d'exercice
u32stru32strstr
1393753"10000034180"2"DUWAT-GEORGES""GHISLAINE"
1393754"10000034180"2"GEORGES""GHISLAINE"
262"10000040062"2"MEYER""Nicolas"
263"10000040062"2"MEYER""Nicolas"
835639"10000040062"2"MEYER""NICOLAS"
1947864"10111726807"2"BARES""Valérie"
275095"10111753363"2"BOUZIDI""MOHAND"
275096"10111753363"2"BOUZIDI""Mohand"
1112006"10111761572"2"BOVENS""Brice"
1669178"10111761572"2"BOVENS""BRICE"
" ], "text/plain": [ "shape: (10_110, 5)\n", "┌─────────┬────────────────┬───────┬────────────────┬───────────────────┐\n", "│ index ┆ Identifiant PP ┆ Count ┆ Nom d'exercice ┆ Prénom d'exercice │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ str ┆ u32 ┆ str ┆ str │\n", "╞═════════╪════════════════╪═══════╪════════════════╪═══════════════════╡\n", "│ 1393753 ┆ 10000034180 ┆ 2 ┆ DUWAT-GEORGES ┆ GHISLAINE │\n", "│ 1393754 ┆ 10000034180 ┆ 2 ┆ GEORGES ┆ GHISLAINE │\n", "│ 262 ┆ 10000040062 ┆ 2 ┆ MEYER ┆ Nicolas │\n", "│ 263 ┆ 10000040062 ┆ 2 ┆ MEYER ┆ Nicolas │\n", "│ 835639 ┆ 10000040062 ┆ 2 ┆ MEYER ┆ NICOLAS │\n", "│ … ┆ … ┆ … ┆ … ┆ … │\n", "│ 1947864 ┆ 10111726807 ┆ 2 ┆ BARES ┆ Valérie │\n", "│ 275095 ┆ 10111753363 ┆ 2 ┆ BOUZIDI ┆ MOHAND │\n", "│ 275096 ┆ 10111753363 ┆ 2 ┆ BOUZIDI ┆ Mohand │\n", "│ 1112006 ┆ 10111761572 ┆ 2 ┆ BOVENS ┆ Brice │\n", "│ 1669178 ┆ 10111761572 ┆ 2 ┆ BOVENS ┆ BRICE │\n", "└─────────┴────────────────┴───────┴────────────────┴───────────────────┘" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "%%time\n",
 "# Strict check: keep the rows of every 'Identifiant PP' whose rows carry more than one\n",
 "# distinct \"last name + space + first name\" string (exact, case-sensitive comparison).\n",
 "df1 = (\n",
 "    df.with_columns(\n",
 "        (pd.col(\"Nom d'exercice\") + \" \" + pd.col(\"Prénom d'exercice\"))\n",
 "        .n_unique()\n",
 "        .over(\"Identifiant PP\")\n",
 "        .alias(\"Count\")\n",
 "    )\n",
 "    .filter(pd.col(\"Count\") > 1)\n",
 "    .sort(by=[\"Identifiant PP\", \"index\"])\n",
 "    .select(\n",
 "        [\n",
 "            \"index\",\n",
 "            \"Identifiant PP\",\n",
 "            \"Count\",\n",
 "            \"Nom d'exercice\",\n",
 "            \"Prénom d'exercice\",\n",
 "        ]\n",
 "    )\n",
 ")\n",
 "df1\n"
 ] }, { "cell_type": "code", "execution_count": 3, "id": "c418a6ea7abd77b", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:45:55.090712Z", "start_time": "2025-08-19T22:45:55.072647Z" } }, "outputs": [], "source": [
 "# Export the strict name variations as an unquoted, pipe-separated file.\n",
 "df1.write_csv(\n",
 "    folder + output_file + \"-Names_Variations_Strict\" + output_extension,\n",
 "    separator=\"|\",\n",
 "    quote_style=\"never\",\n",
 "    line_terminator=\"\\n\",\n",
 ")\n"
 ] }, { "cell_type": "code", "execution_count": 4, "id": 
"9d94b716364356c7", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:45:56.962873Z", "start_time": "2025-08-19T22:45:55.259223Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2.31 s\n", "Wall time: 509 ms\n" ] }, { "data": { "text/html": [ "
\n", "shape: (5_847, 5)
indexIdentifiant PPCountNom d'exercicePrénom d'exercice
u32stru32strstr
1393753"10000034180"2"DUWAT-GEORGES""GHISLAINE"
1393754"10000034180"2"GEORGES""GHISLAINE"
1393833"10000046051"2"STUDER""AGNES"
1672210"10000046051"2"JURION""AGNES"
278448"10000101518"2"BARREYRE""SANDRINE"
2224008"10111558895"2"QUIROGA TENORIO DE CARVALHO""Rafaela"
831407"10111667787"2"ANDY""MARIE MORGANE"
1389569"10111667787"2"BARDIL""Morgane"
274737"10111726807"2"COURET BARES""Valérie"
1947864"10111726807"2"BARES""Valérie"
" ], "text/plain": [ "shape: (5_847, 5)\n", "┌─────────┬────────────────┬───────┬─────────────────────────────┬───────────────────┐\n", "│ index ┆ Identifiant PP ┆ Count ┆ Nom d'exercice ┆ Prénom d'exercice │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ str ┆ u32 ┆ str ┆ str │\n", "╞═════════╪════════════════╪═══════╪═════════════════════════════╪═══════════════════╡\n", "│ 1393753 ┆ 10000034180 ┆ 2 ┆ DUWAT-GEORGES ┆ GHISLAINE │\n", "│ 1393754 ┆ 10000034180 ┆ 2 ┆ GEORGES ┆ GHISLAINE │\n", "│ 1393833 ┆ 10000046051 ┆ 2 ┆ STUDER ┆ AGNES │\n", "│ 1672210 ┆ 10000046051 ┆ 2 ┆ JURION ┆ AGNES │\n", "│ 278448 ┆ 10000101518 ┆ 2 ┆ BARREYRE ┆ SANDRINE │\n", "│ … ┆ … ┆ … ┆ … ┆ … │\n", "│ 2224008 ┆ 10111558895 ┆ 2 ┆ QUIROGA TENORIO DE CARVALHO ┆ Rafaela │\n", "│ 831407 ┆ 10111667787 ┆ 2 ┆ ANDY ┆ MARIE MORGANE │\n", "│ 1389569 ┆ 10111667787 ┆ 2 ┆ BARDIL ┆ Morgane │\n", "│ 274737 ┆ 10111726807 ┆ 2 ┆ COURET BARES ┆ Valérie │\n", "│ 1947864 ┆ 10111726807 ┆ 2 ┆ BARES ┆ Valérie │\n", "└─────────┴────────────────┴───────┴─────────────────────────────┴───────────────────┘" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "%%time\n",
 "# Case-insensitive check: same idea as the strict one, but the combined\n",
 "# \"last name + space + first name\" string is lower-cased before counting distinct values.\n",
 "df2 = (\n",
 "    df.with_columns(\n",
 "        (pd.col(\"Nom d'exercice\") + \" \" + pd.col(\"Prénom d'exercice\"))\n",
 "        .str.to_lowercase()\n",
 "        .n_unique()\n",
 "        .over(\"Identifiant PP\")\n",
 "        .alias(\"Count\")\n",
 "    )\n",
 "    .filter(pd.col(\"Count\") > 1)\n",
 "    .sort(by=[\"Identifiant PP\", \"index\"])\n",
 "    .select(\n",
 "        [\n",
 "            \"index\",\n",
 "            \"Identifiant PP\",\n",
 "            \"Count\",\n",
 "            \"Nom d'exercice\",\n",
 "            \"Prénom d'exercice\",\n",
 "        ]\n",
 "    )\n",
 ")\n",
 "df2\n"
 ] }, { "cell_type": "code", "execution_count": 5, "id": "18aab4499103491a", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:45:57.433036Z", "start_time": "2025-08-19T22:45:57.417970Z" } }, "outputs": [], "source": [
 "# Export the case-insensitive name variations as an unquoted, pipe-separated file.\n",
 "df2.write_csv(\n",
 "    folder + output_file + \"-Names_Variations_Insensitive\" + output_extension,\n",
 "    separator=\"|\",\n",
 "    quote_style=\"never\",\n",
 "    line_terminator=\"\\n\",\n",
 ")\n"
 ] }, { "cell_type": "code", "execution_count": 6, "id": "8e4e3e22f16fea1c", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:46:02.915526Z", "start_time": "2025-08-19T22:45:57.710258Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 3.19 s\n", "Wall time: 1.42 s\n" ] }, { "data": { "text/html": [ "
\n", "shape: (3_894, 6)
indexIdentifiant PPCountNom d'exercicePrénom d'exerciceNom_Prénom_Nettoyé
u32stru32strstrstr
1393753"10000034180"2"DUWAT-GEORGES""GHISLAINE""duwat georges ghislaine"
1393754"10000034180"2"GEORGES""GHISLAINE""georges ghislaine"
1393833"10000046051"2"STUDER""AGNES""studer agnes"
1672210"10000046051"2"JURION""AGNES""jurion agnes"
278448"10000101518"2"BARREYRE""SANDRINE""barreyre sandrine"
2224008"10111558895"2"QUIROGA TENORIO DE CARVALHO""Rafaela""quiroga tenorio de carvalho ra…
831407"10111667787"2"ANDY""MARIE MORGANE""andy marie morgane"
1389569"10111667787"2"BARDIL""Morgane""bardil morgane"
274737"10111726807"2"COURET BARES""Valérie""couret bares valerie"
1947864"10111726807"2"BARES""Valérie""bares valerie"
" ], "text/plain": [ "shape: (3_894, 6)\n", "┌─────────┬────────────────┬───────┬─────────────────────┬───────────────────┬─────────────────────┐\n", "│ index ┆ Identifiant PP ┆ Count ┆ Nom d'exercice ┆ Prénom d'exercice ┆ Nom_Prénom_Nettoyé │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n", "╞═════════╪════════════════╪═══════╪═════════════════════╪═══════════════════╪═════════════════════╡\n", "│ 1393753 ┆ 10000034180 ┆ 2 ┆ DUWAT-GEORGES ┆ GHISLAINE ┆ duwat georges │\n", "│ ┆ ┆ ┆ ┆ ┆ ghislaine │\n", "│ 1393754 ┆ 10000034180 ┆ 2 ┆ GEORGES ┆ GHISLAINE ┆ georges ghislaine │\n", "│ 1393833 ┆ 10000046051 ┆ 2 ┆ STUDER ┆ AGNES ┆ studer agnes │\n", "│ 1672210 ┆ 10000046051 ┆ 2 ┆ JURION ┆ AGNES ┆ jurion agnes │\n", "│ 278448 ┆ 10000101518 ┆ 2 ┆ BARREYRE ┆ SANDRINE ┆ barreyre sandrine │\n", "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", "│ 2224008 ┆ 10111558895 ┆ 2 ┆ QUIROGA TENORIO DE ┆ Rafaela ┆ quiroga tenorio de │\n", "│ ┆ ┆ ┆ CARVALHO ┆ ┆ carvalho ra… │\n", "│ 831407 ┆ 10111667787 ┆ 2 ┆ ANDY ┆ MARIE MORGANE ┆ andy marie morgane │\n", "│ 1389569 ┆ 10111667787 ┆ 2 ┆ BARDIL ┆ Morgane ┆ bardil morgane │\n", "│ 274737 ┆ 10111726807 ┆ 2 ┆ COURET BARES ┆ Valérie ┆ couret bares │\n", "│ ┆ ┆ ┆ ┆ ┆ valerie │\n", "│ 1947864 ┆ 10111726807 ┆ 2 ┆ BARES ┆ Valérie ┆ bares valerie │\n", "└─────────┴────────────────┴───────┴─────────────────────┴───────────────────┴─────────────────────┘" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "%%time\n",
 "# Normalized check: build a cleaned \"last name + first name\" key (lower-case, accents\n",
 "# folded, punctuation turned into spaces, whitespace collapsed and trimmed) and keep the\n",
 "# rows of every 'Identifiant PP' that has more than one distinct cleaned key.\n",
 "df3 = (\n",
 "    df\n",
 "    .with_columns(\n",
 "        (\n",
 "            (pd.col(\"Nom d'exercice\") + \" \" + pd.col(\"Prénom d'exercice\"))\n",
 "            .str.to_lowercase()\n",
 "            # Accent folding: map accented lower-case letters to their base letter\n",
 "            .str.replace_all(\"à|á|â|ã|ä|å\", \"a\", literal=False)\n",
 "            .str.replace_all(\"ç\", \"c\", literal=False)\n",
 "            .str.replace_all(\"è|é|ê|ë\", \"e\", literal=False)\n",
 "            .str.replace_all(\"ì|í|î|ï\", \"i\", literal=False)\n",
 "            .str.replace_all(\"ñ\", \"n\", literal=False)\n",
 "            .str.replace_all(\"ò|ó|ô|õ|ö\", \"o\", literal=False)\n",
 "            .str.replace_all(\"ù|ú|û|ü\", \"u\", literal=False)\n",
 "            .str.replace_all(\"ý|ÿ\", \"y\", literal=False)\n",
 "            # Replace every character that is not a-z, 0-9 or whitespace with a space,\n",
 "            # then collapse whitespace runs into one space and trim both ends.\n",
 "            # The patterns are raw strings, so the whitespace class takes a single\n",
 "            # backslash; a doubled backslash is read by the regex engine as a literal\n",
 "            # backslash followed by the letter s, which left runs of spaces in place\n",
 "            # (e.g. 'A - B' and 'A-B' produced different keys).\n",
 "            .str.replace_all(r\"[^a-z0-9\\s]\", \" \", literal=False)\n",
 "            .str.replace_all(r\"\\s+\", \" \", literal=False)\n",
 "            .str.strip_chars()\n",
 "        ).alias(\"Nom_Prénom_Nettoyé\")\n",
 "    )\n",
 "    .with_columns(\n",
 "        pd.col(\"Nom_Prénom_Nettoyé\").n_unique().over(\"Identifiant PP\").alias(\"Count\")\n",
 "    )\n",
 "    .filter(pd.col(\"Count\") > 1)\n",
 "    .sort([\"Identifiant PP\", \"index\"])\n",
 "    .select(\"index\", \"Identifiant PP\", \"Count\", \"Nom d'exercice\", \"Prénom d'exercice\", \"Nom_Prénom_Nettoyé\")\n",
 ")\n",
 "df3\n"
 ] }, { "cell_type": "code", "execution_count": 7, "id": "aab2ae2e91a7190c", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:46:03.290835Z", "start_time": "2025-08-19T22:46:03.280259Z" } }, "outputs": [], "source": [
 "# Export the normalized name variations as an unquoted, pipe-separated file.\n",
 "df3.write_csv(f\"{folder}{output_file}-Names_Variations_Normalized{output_extension}\",\n",
 " separator='|',\n",
 " quote_style=\"never\",\n",
 " line_terminator='\\n')\n"
 ] }, { "cell_type": "code", "execution_count": 8, "id": "3c2f2bb5fc3c2a5e", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:46:07.814563Z", "start_time": "2025-08-19T22:46:03.493442Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 3.78 s\n", "Wall time: 645 ms\n" ] }, { "data": { "text/html": [ "
\n", "shape: (81, 31)
indexIdentifiant PPSite_IdentifierNuméro SIRET siteIdentifiant technique de la structureSite_Info_Variations_CountRaison sociale siteEnseigne commerciale siteComplément destinataire (coord. structure)Complément point géographique (coord. structure)Numéro Voie (coord. structure)Indice répétition voie (coord. structure)Code type de voie (coord. structure)Libellé type de voie (coord. structure)Libellé Voie (coord. structure)Mention distribution (coord. structure)Bureau cedex (coord. structure)Code postal (coord. structure)Code commune (coord. structure)Libellé commune (coord. structure)Code pays (coord. structure)Libellé pays (coord. structure)Téléphone (coord. structure)Téléphone 2 (coord. structure)Télécopie (coord. structure)Adresse e-mail (coord. structure)Code Département (structure)Libellé Département (structure)Ancien identifiant de la structureCode secteur d'activitéLibellé secteur d'activité
u32strstrstrstru32strstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstr
278582"10000116383""30980220500505""30980220500505""R10100000198782"2"OEUVRES HOSPITALIERES FRANCAIS…"ORDRE DE MALTE FRANCE""ORDRE DE MALTE FRANCE"null"42"nullnullnull"RUE DES VOLONTAIRES"null"75015 PARIS""75015""75056""Paris""99000""France"nullnullnullnullnullnull"330980220500505""SA28""Asso et orga humanitaire"
1394285"10000116383""30980220500505""30980220500505""R10100000779807"2"OEUVRE HOSP FRANC DE L'ORDRE D…nullnullnull"49"null"R""Rue""DE LA CHAPELLE"null"75018 PARIS 18E  ARRONDISSEMEN…"75018""75118""Paris 18e  Arrondissement"nullnullnullnullnullnullnullnull"330980220500505""SA28""Asso et orga humanitaire"
840759"10000667187""18003502402369""18003502402369""R10100000049794"3"DRSM NORD PICARDIEELSM 59""SITE MAUBEUGE"nullnullnullnull"PL""Place""DE WATTIGNIES""BP""59603 MAUBEUGE""59603""59392""Maubeuge"nullnullnullnullnullnullnullnull"318003502402369""SA24""Organisme de Sécurité Sociale"
1677281"10000667187""18003502402369""18003502402369""R10100000050000"3"DRSM NORD PICARDIEELSM 59""SITE VALENCIENNES"nullnull"2"null"PL""Place""DE LA REPUBLIQUE""BP""59304 VALENCIENNES""59304""59606""Valenciennes"nullnullnullnullnullnullnullnull"318003502402369""SA24""Organisme de Sécurité Sociale"
1956358"10000667187""18003502402369""18003502402369""R10100000049793"3"DRSM NORD PICARDIEELSM 59""SITE CAMBRAI"nullnull"10"null"R""Rue""SAINT LAZARE""BP""59403 CAMBRAI""59403""59122""Cambrai"nullnullnullnullnullnullnullnull"318003502402369""SA24""Organisme de Sécurité Sociale"
539023"10110592168""88085935000014""88085935000014""R10100000325887"2"FAREVA PAU"nullnullnullnullnull"AV""Avenue""DU BEARN"null"64320 IDRON""64320""64269""Idron"nullnull"0559402100"null"0559402119"nullnullnull"388085935000014""SA32""Fab. Exploit. Import. Méd. DM"
260720"10110597498""13000800600038""13000800600038""R10100000097229"2"AGENCE REGIONALE SANTE PAYS LO…null"DELEGATION TERRITORIALE"null"2"nullnullnull"BOULEVARD MURAT"null"53000 LAVAL""53000""53130""Laval"nullnullnullnullnullnullnullnull"313000800600038""SA24""Organisme de Sécurité Sociale"
1376341"10110597498""13000800600038""13000800600038""R10100000097229"2"AGENCE REGIONALE SANTE PAYS LO…nullnullnull"17"null"BD""Boulevard""GASTON DOUMERGUE"null"44262 NANTES""44262""44109""Nantes"nullnullnullnullnullnullnullnull"313000800600038""SA24""Organisme de Sécurité Sociale"
1658604"10110910345""18003502401098""18003502401098""R10100000398898"2"CAISSE NATIONALE DE L'ASSURANC…"DRSM DIRECTION REG. DU SERVICE…"QUARTIER DU LAC"null"80"nullnullnull"AVENUE DE LA JALLERE""BP 260""33300 BORDEAUX""33300""33063""Bordeaux""99000""France"nullnullnullnullnullnull"318003502401098""SA24""Organisme de Sécurité Sociale"
1937582"10110910345""18003502401098""18003502401098""R10100000398898"2"CAISSE NATIONALE DE L'ASSURANC…"DRSM DIRECTION REG. DU SERVICE…nullnull"207"null"R""Rue""FONTAINEBLEAU""BP""40011 MONT-DE-MARSAN""40011""40192""Mont-de-Marsan"nullnullnullnullnullnullnullnull"318003502401098""SA24""Organisme de Sécurité Sociale"
" ], "text/plain": [ "shape: (81, 31)\n", "┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", "│ index ┆ Identifian ┆ Site_Iden ┆ Numéro ┆ … ┆ Libellé ┆ Ancien ┆ Code ┆ Libellé │\n", "│ --- ┆ t PP ┆ tifier ┆ SIRET ┆ ┆ Départeme ┆ identifia ┆ secteur ┆ secteur │\n", "│ u32 ┆ --- ┆ --- ┆ site ┆ ┆ nt (struc ┆ nt de la ┆ d'activit ┆ d'activit │\n", "│ ┆ str ┆ str ┆ --- ┆ ┆ ture… ┆ struc… ┆ é ┆ é │\n", "│ ┆ ┆ ┆ str ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ str ┆ str │\n", "╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", "│ 278582 ┆ 1000011638 ┆ 309802205 ┆ 309802205 ┆ … ┆ null ┆ 330980220 ┆ SA28 ┆ Asso et │\n", "│ ┆ 3 ┆ 00505 ┆ 00505 ┆ ┆ ┆ 500505 ┆ ┆ orga huma │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ nitaire │\n", "│ 1394285 ┆ 1000011638 ┆ 309802205 ┆ 309802205 ┆ … ┆ null ┆ 330980220 ┆ SA28 ┆ Asso et │\n", "│ ┆ 3 ┆ 00505 ┆ 00505 ┆ ┆ ┆ 500505 ┆ ┆ orga huma │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ nitaire │\n", "│ 840759 ┆ 1000066718 ┆ 180035024 ┆ 180035024 ┆ … ┆ null ┆ 318003502 ┆ SA24 ┆ Organisme │\n", "│ ┆ 7 ┆ 02369 ┆ 02369 ┆ ┆ ┆ 402369 ┆ ┆ de │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", "│ 1677281 ┆ 1000066718 ┆ 180035024 ┆ 180035024 ┆ … ┆ null ┆ 318003502 ┆ SA24 ┆ Organisme │\n", "│ ┆ 7 ┆ 02369 ┆ 02369 ┆ ┆ ┆ 402369 ┆ ┆ de │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", "│ 1956358 ┆ 1000066718 ┆ 180035024 ┆ 180035024 ┆ … ┆ null ┆ 318003502 ┆ SA24 ┆ Organisme │\n", "│ ┆ 7 ┆ 02369 ┆ 02369 ┆ ┆ ┆ 402369 ┆ ┆ de │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", "│ 539023 ┆ 1011059216 ┆ 880859350 ┆ 880859350 ┆ … ┆ null ┆ 388085935 ┆ SA32 ┆ Fab. │\n", "│ ┆ 8 ┆ 00014 ┆ 00014 ┆ ┆ ┆ 000014 ┆ ┆ Exploit. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. 
DM │\n", "│ 260720 ┆ 1011059749 ┆ 130008006 ┆ 130008006 ┆ … ┆ null ┆ 313000800 ┆ SA24 ┆ Organisme │\n", "│ ┆ 8 ┆ 00038 ┆ 00038 ┆ ┆ ┆ 600038 ┆ ┆ de │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", "│ 1376341 ┆ 1011059749 ┆ 130008006 ┆ 130008006 ┆ … ┆ null ┆ 313000800 ┆ SA24 ┆ Organisme │\n", "│ ┆ 8 ┆ 00038 ┆ 00038 ┆ ┆ ┆ 600038 ┆ ┆ de │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", "│ 1658604 ┆ 1011091034 ┆ 180035024 ┆ 180035024 ┆ … ┆ null ┆ 318003502 ┆ SA24 ┆ Organisme │\n", "│ ┆ 5 ┆ 01098 ┆ 01098 ┆ ┆ ┆ 401098 ┆ ┆ de │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", "│ 1937582 ┆ 1011091034 ┆ 180035024 ┆ 180035024 ┆ … ┆ null ┆ 318003502 ┆ SA24 ┆ Organisme │\n", "│ ┆ 5 ┆ 01098 ┆ 01098 ┆ ┆ ┆ 401098 ┆ ┆ de │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sécurité │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Sociale │\n", "└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "%%time\n",
 "# Site-description columns: the contiguous run of columns from 'Raison sociale site' to\n",
 "# \"Libellé secteur d'activité\" (inclusive, in file order), minus \"Autorité d'enregistrement\".\n",
 "all_columns = df.columns\n",
 "start_col = 'Raison sociale site'\n",
 "end_col = \"Libellé secteur d'activité\"\n",
 "start_col_index = all_columns.index(start_col)\n",
 "end_col_index = all_columns.index(end_col)\n",
 "site_info_cols = [\n",
 "    name\n",
 "    for name in all_columns[start_col_index : end_col_index + 1]\n",
 "    if name != \"Autorité d'enregistrement\"\n",
 "]\n",
 "\n",
 "# Per (Identifiant PP, site) check, restricted to rows that have no FINESS number but do\n",
 "# have a SIRET or a technical structure identifier: keep the groups whose site-description\n",
 "# columns take more than one distinct combination of values.\n",
 "df4 = (\n",
 "    df\n",
 "    .filter(\n",
 "        pd.col(\"Numéro FINESS site\").is_null()\n",
 "        & (\n",
 "            pd.col(\"Numéro SIRET site\").is_not_null()\n",
 "            | pd.col(\"Identifiant technique de la structure\").is_not_null()\n",
 "        )\n",
 "    )\n",
 "    .with_columns(\n",
 "        # SIRET when present, otherwise the technical structure identifier\n",
 "        pd.coalesce(\n",
 "            pd.col(\"Numéro SIRET site\"),\n",
 "            pd.col(\"Identifiant technique de la structure\"),\n",
 "        ).alias(\"Site_Identifier\")\n",
 "    )\n",
 "    .with_columns(\n",
 "        pd.struct(site_info_cols)\n",
 "        .n_unique()\n",
 "        .over([\"Identifiant PP\", \"Site_Identifier\"])\n",
 "        .alias(\"Site_Info_Variations_Count\")\n",
 "    )\n",
 "    .filter(pd.col(\"Site_Info_Variations_Count\") > 1)\n",
 "    .sort(by=[\"Identifiant PP\", \"Site_Identifier\", \"index\"])\n",
 "    .select(\n",
 "        [\n",
 "            \"index\",\n",
 "            \"Identifiant PP\",\n",
 "            \"Site_Identifier\",\n",
 "            \"Numéro SIRET site\",\n",
 "            \"Identifiant technique de la structure\",\n",
 "            \"Site_Info_Variations_Count\",\n",
 "            *site_info_cols,\n",
 "        ]\n",
 "    )\n",
 ")\n",
 "df4"
 ] }, { "cell_type": "code", "execution_count": 9, "id": "c1fd01e419f4ccc9", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:46:07.974271Z", "start_time": "2025-08-19T22:46:07.943280Z" } }, "outputs": [], "source": [
 "# Export the per-person site variations as an unquoted, pipe-separated file.\n",
 "df4.write_csv(\n",
 "    folder + output_file + \"-Sites_Variations\" + output_extension,\n",
 "    separator=\"|\",\n",
 "    quote_style=\"never\",\n",
 "    line_terminator=\"\\n\",\n",
 ")\n"
 ] }, { "cell_type": "code", "execution_count": 10, "id": "7838523925fc85ee", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:46:12.781888Z", "start_time": "2025-08-19T22:46:08.306776Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2.84 s\n", "Wall time: 596 ms\n" ] }, { "data": { "text/html": [ "
\n", "shape: (4_350, 31)
indexIdentifiant PPSite_IdentifierNuméro SIRET siteIdentifiant technique de la structureSite_Info_Variations_CountRaison sociale siteEnseigne commerciale siteComplément destinataire (coord. structure)Complément point géographique (coord. structure)Numéro Voie (coord. structure)Indice répétition voie (coord. structure)Code type de voie (coord. structure)Libellé type de voie (coord. structure)Libellé Voie (coord. structure)Mention distribution (coord. structure)Bureau cedex (coord. structure)Code postal (coord. structure)Code commune (coord. structure)Libellé commune (coord. structure)Code pays (coord. structure)Libellé pays (coord. structure)Téléphone (coord. structure)Téléphone 2 (coord. structure)Télécopie (coord. structure)Adresse e-mail (coord. structure)Code Département (structure)Libellé Département (structure)Ancien identifiant de la structureCode secteur d'activitéLibellé secteur d'activité
u32strstrstrstru32strstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstrstr
126741"10100002293""05650171100115""05650171100115""R10000001502146"2"BECTON DICKINSON FRANCE"nullnullnull"11"null"R""Rue""RUE ARISTIDE BERGES"null"38801 LE PONT DE CLAIX CEDEX""38801""38317""Le Pont-de-Claix""99000""France"nullnullnullnullnullnull"305650171100115""SA32""Fab. Exploit. Import. Méd. DM"
292886"10001806768""05650171100115""05650171100115""R10000001502146"2"BECTON DICKINSON FRANCE"nullnullnull"11"null"R""Rue""RUE ARISTIDE BERGES"null"38801 LE PONT DE CLAIX CEDEX""38801""38317""Le Pont-de-Claix""99000""France"nullnullnullnullnullnull"305650171100115""SA32""Fab. Exploit. Import. Méd. DM"
473490"10104800411""05650171100115""05650171100115""R10000001502146"2"BECTON DICKINSON FRANCE"nullnullnull"11"null"R""Rue""RUE ARISTIDE BERGES"null"38801 LE PONT DE CLAIX CEDEX""38801""38317""Le Pont-de-Claix""99000""France"nullnullnullnullnullnull"305650171100115""SA32""Fab. Exploit. Import. Méd. DM"
738289"10103687157""05650171100115""05650171100115""R10000001502146"2"BECTON DICKINSON FRANCE"nullnullnull"11"null"R""Rue""RUE ARISTIDE BERGES"null"38801 LE PONT DE CLAIX CEDEX""38801""38317""Le Pont-de-Claix""99000""France"nullnullnullnullnullnull"305650171100115""SA32""Fab. Exploit. Import. Méd. DM"
850421"10001796597""05650171100115""05650171100115""R10000001502146"2"BECTON DICKINSON FRANCE"nullnullnull"11"null"R""Rue""RUE ARISTIDE BERGES"null"38801 LE PONT DE CLAIX CEDEX""38801""38317""Le Pont-de-Claix""99000""France"nullnullnullnullnullnull"305650171100115""SA32""Fab. Exploit. Import. Méd. DM"
808716"10109869403""98452619400019""98452619400019""R10100000673943"2"SELARL CENTRE DE PODOLOGIE SPO…"SELARL CENTRE DE PODOLOGIE SPO…nullnull"8"null"R""Rue""GEORGES NEGREVERGNE"null"33700 MERIGNAC""33700""33281""Mérignac""99000""France"nullnullnullnullnullnull"398452619400019""SA09""Exercice en Société"
1541009"10101293263""98452619400019""98452619400019""R10100000673943"2"SELARL CENTRE DE PODOLOGIE SPO…"SELARL CENTRE DE PODOLOGIE SPO…nullnull"8"null"R""Rue""GEORGES NEGREVERGNE"null"33700 MERIGNAC""33700""33281""Mérignac""99000""France"nullnullnullnullnullnull"398452619400019""SA09""Exercice en Société"
1900660"10108015131""98452619400019""98452619400019""R10100000673943"2"SELARL CENTRE DE PODOLOGIE SPO…"SELARL CENTRE DE PODOLOGIE SPO…nullnull"8"null"R""Rue""GEORGES NEGREVERGNE"null"33700 MERIGNAC""33700""33281""Mérignac""99000""France"nullnullnullnullnullnull"398452619400019""SA09""Exercice en Société"
1611683"10107235243""99882350430834""99882350430834""R10100000554688"2"ADECCO FRANCE""ADECCO"null"PARC VALMY PARK AVENUE BAT A 1…"8""D"nullnull"RUE JEANNE BARRET"null"21000 DIJON""21000""21231""Dijon""99000""France"nullnullnullnullnullnull"399882350430834""SA11""Entreprise d'intérim"
1630338"10108693036""99882350430834""99882350430834""R10100000413248"2"ADECCO MEDICAL"nullnullnull"8""D""R""Rue""JEANNE BARRET"null"21000 DIJON""21000""21231""Dijon""99000""France"nullnullnullnullnullnull"399882350430834""SA11""Entreprise d'intérim"
" ], "text/plain": [ "shape: (4_350, 31)\n", "┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", "│ index ┆ Identifian ┆ Site_Iden ┆ Numéro ┆ … ┆ Libellé ┆ Ancien ┆ Code ┆ Libellé │\n", "│ --- ┆ t PP ┆ tifier ┆ SIRET ┆ ┆ Départeme ┆ identifia ┆ secteur ┆ secteur │\n", "│ u32 ┆ --- ┆ --- ┆ site ┆ ┆ nt (struc ┆ nt de la ┆ d'activit ┆ d'activit │\n", "│ ┆ str ┆ str ┆ --- ┆ ┆ ture… ┆ struc… ┆ é ┆ é │\n", "│ ┆ ┆ ┆ str ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ str ┆ str │\n", "╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", "│ 126741 ┆ 1010000229 ┆ 056501711 ┆ 056501711 ┆ … ┆ null ┆ 305650171 ┆ SA32 ┆ Fab. │\n", "│ ┆ 3 ┆ 00115 ┆ 00115 ┆ ┆ ┆ 100115 ┆ ┆ Exploit. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. DM │\n", "│ 292886 ┆ 1000180676 ┆ 056501711 ┆ 056501711 ┆ … ┆ null ┆ 305650171 ┆ SA32 ┆ Fab. │\n", "│ ┆ 8 ┆ 00115 ┆ 00115 ┆ ┆ ┆ 100115 ┆ ┆ Exploit. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. DM │\n", "│ 473490 ┆ 1010480041 ┆ 056501711 ┆ 056501711 ┆ … ┆ null ┆ 305650171 ┆ SA32 ┆ Fab. │\n", "│ ┆ 1 ┆ 00115 ┆ 00115 ┆ ┆ ┆ 100115 ┆ ┆ Exploit. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. DM │\n", "│ 738289 ┆ 1010368715 ┆ 056501711 ┆ 056501711 ┆ … ┆ null ┆ 305650171 ┆ SA32 ┆ Fab. │\n", "│ ┆ 7 ┆ 00115 ┆ 00115 ┆ ┆ ┆ 100115 ┆ ┆ Exploit. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. DM │\n", "│ 850421 ┆ 1000179659 ┆ 056501711 ┆ 056501711 ┆ … ┆ null ┆ 305650171 ┆ SA32 ┆ Fab. │\n", "│ ┆ 7 ┆ 00115 ┆ 00115 ┆ ┆ ┆ 100115 ┆ ┆ Exploit. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Import. │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Méd. 
DM │\n", "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", "│ 808716 ┆ 1010986940 ┆ 984526194 ┆ 984526194 ┆ … ┆ null ┆ 398452619 ┆ SA09 ┆ Exercice │\n", "│ ┆ 3 ┆ 00019 ┆ 00019 ┆ ┆ ┆ 400019 ┆ ┆ en │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Société │\n", "│ 1541009 ┆ 1010129326 ┆ 984526194 ┆ 984526194 ┆ … ┆ null ┆ 398452619 ┆ SA09 ┆ Exercice │\n", "│ ┆ 3 ┆ 00019 ┆ 00019 ┆ ┆ ┆ 400019 ┆ ┆ en │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Société │\n", "│ 1900660 ┆ 1010801513 ┆ 984526194 ┆ 984526194 ┆ … ┆ null ┆ 398452619 ┆ SA09 ┆ Exercice │\n", "│ ┆ 1 ┆ 00019 ┆ 00019 ┆ ┆ ┆ 400019 ┆ ┆ en │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Société │\n", "│ 1611683 ┆ 1010723524 ┆ 998823504 ┆ 998823504 ┆ … ┆ null ┆ 399882350 ┆ SA11 ┆ Entrepris │\n", "│ ┆ 3 ┆ 30834 ┆ 30834 ┆ ┆ ┆ 430834 ┆ ┆ e │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ d'intérim │\n", "│ 1630338 ┆ 1010869303 ┆ 998823504 ┆ 998823504 ┆ … ┆ null ┆ 399882350 ┆ SA11 ┆ Entrepris │\n", "│ ┆ 6 ┆ 30834 ┆ 30834 ┆ ┆ ┆ 430834 ┆ ┆ e │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ d'intérim │\n", "└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "%%time\n",
 "# Global per-site check (across all 'Identifiant PP'), restricted to rows that have no\n",
 "# FINESS number but do have a SIRET or a technical structure identifier: keep the sites\n",
 "# whose description columns (site_info_cols, defined in an earlier cell) take more than\n",
 "# one distinct combination of values.\n",
 "df5 = (\n",
 "    df\n",
 "    .filter(\n",
 "        pd.col(\"Numéro FINESS site\").is_null()\n",
 "        & (\n",
 "            pd.col(\"Numéro SIRET site\").is_not_null()\n",
 "            | pd.col(\"Identifiant technique de la structure\").is_not_null()\n",
 "        )\n",
 "    )\n",
 "    .with_columns(\n",
 "        # SIRET when present, otherwise the technical structure identifier\n",
 "        pd.coalesce(\n",
 "            pd.col(\"Numéro SIRET site\"),\n",
 "            pd.col(\"Identifiant technique de la structure\"),\n",
 "        ).alias(\"Site_Identifier\")\n",
 "    )\n",
 "    .with_columns(\n",
 "        pd.struct(site_info_cols)\n",
 "        .n_unique()\n",
 "        .over([\"Site_Identifier\"])\n",
 "        .alias(\"Site_Info_Variations_Count\")\n",
 "    )\n",
 "    .filter(pd.col(\"Site_Info_Variations_Count\") > 1)\n",
 "    .sort(by=[\"Site_Identifier\", \"index\"])\n",
 "    .select(\n",
 "        [\n",
 "            \"index\",\n",
 "            \"Identifiant PP\",\n",
 "            \"Site_Identifier\",\n",
 "            \"Numéro SIRET site\",\n",
 "            \"Identifiant technique de la structure\",\n",
 "            \"Site_Info_Variations_Count\",\n",
 "            *site_info_cols,\n",
 "        ]\n",
 "    )\n",
 ")\n",
 "df5\n"
 ] }, {
"cell_type": "code", "execution_count": 11, "id": "416184f32f973a71", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:46:13.670911Z", "start_time": "2025-08-19T22:46:13.655386Z" } }, "outputs": [], "source": [
 "# Export the global site variations as an unquoted, pipe-separated file.\n",
 "df5.write_csv(\n",
 "    folder + output_file + \"-Sites_Variations_Global\" + output_extension,\n",
 "    separator=\"|\",\n",
 "    quote_style=\"never\",\n",
 "    line_terminator=\"\\n\",\n",
 ")\n"
 ] }, { "cell_type": "code", "execution_count": 12, "id": "84549f83ce5e92f", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:46:17.023811Z", "start_time": "2025-08-19T22:46:14.032470Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 3.77 s\n", "Wall time: 685 ms\n" ] }, { "data": { "text/html": [ "
\n", "shape: (25_105, 12)
indexIdentifiant PPSite_Identifier_GlobalNuméro FINESS siteNuméro SIRET siteIdentifiant technique de la structureActivites_CountLibellé professionLibellé savoir-faireLibellé mode exerciceLibellé rôleLibellé genre activité
u32strstrstrstrstru32strstrstrstrstr
1393660"10000017979""130786445""130786445""30247736900011""F130786445"2"Ostéopathe"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
1951128"10000017979""130786445""130786445""30247736900011""F130786445"2"Sage-Femme"null"Salarié""Fonction non définie""Activité standard de soin ou d…
556685"10000086842""860012228""860012228""13001256000038""F860012228"2"Psychologue"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
1951541"10000086842""860012228""860012228""13001256000038""F860012228"2"Psychothérapeute"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
278389"10000090869""110000023""110000023""26110002800149""F110000023"2"Sage-Femme"null"Salarié""Fonction non définie""Activité standard de soin ou d…
2225628"10111685458""670000025""670000025""26670057400012""F670000025"2"Chirurgien-Dentiste"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
275095"10111753363""13002370800014"null"13002370800014""R10100000794696"2"Infirmier"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
275096"10111753363""13002370800014"null"13002370800014""R10100000794696"2"Psychologue"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
276592"10111877634""210987632""210987632""26210006800010""F210987632"2"Psychothérapeute"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
1392260"10111877634""210987632""210987632""26210006800010""F210987632"2"Psychologue"null"Salarié""Salarié en poste fixe""Activité standard de soin ou d…
" ], "text/plain": [ "shape: (25_105, 12)\n", "┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐\n", "│ index ┆ Identifian ┆ Site_Iden ┆ Numéro ┆ … ┆ Libellé ┆ Libellé ┆ Libellé ┆ Libellé │\n", "│ --- ┆ t PP ┆ tifier_Gl ┆ FINESS ┆ ┆ savoir-fa ┆ mode ┆ rôle ┆ genre │\n", "│ u32 ┆ --- ┆ obal ┆ site ┆ ┆ ire ┆ exercice ┆ --- ┆ activité │\n", "│ ┆ str ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ str ┆ --- │\n", "│ ┆ ┆ str ┆ str ┆ ┆ str ┆ str ┆ ┆ str │\n", "╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡\n", "│ 1393660 ┆ 1000001797 ┆ 130786445 ┆ 130786445 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", "│ ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", "│ 1951128 ┆ 1000001797 ┆ 130786445 ┆ 130786445 ┆ … ┆ null ┆ Salarié ┆ Fonction ┆ Activité │\n", "│ ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ non ┆ standard │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ définie ┆ de soin │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", "│ 556685 ┆ 1000008684 ┆ 860012228 ┆ 860012228 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", "│ ┆ 2 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", "│ 1951541 ┆ 1000008684 ┆ 860012228 ┆ 860012228 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", "│ ┆ 2 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", "│ 278389 ┆ 1000009086 ┆ 110000023 ┆ 110000023 ┆ … ┆ null ┆ Salarié ┆ Fonction ┆ Activité │\n", "│ ┆ 9 ┆ ┆ ┆ ┆ ┆ ┆ non ┆ standard │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ définie ┆ de soin │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", "│ 2225628 ┆ 1011168545 ┆ 670000025 ┆ 670000025 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", "│ ┆ 8 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", "│ 275095 ┆ 1011175336 ┆ 130023708 ┆ null ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", "│ ┆ 3 ┆ 00014 
┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", "│ 275096 ┆ 1011175336 ┆ 130023708 ┆ null ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", "│ ┆ 3 ┆ 00014 ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", "│ 276592 ┆ 1011187763 ┆ 210987632 ┆ 210987632 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", "│ ┆ 4 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", "│ 1392260 ┆ 1011187763 ┆ 210987632 ┆ 210987632 ┆ … ┆ null ┆ Salarié ┆ Salarié ┆ Activité │\n", "│ ┆ 4 ┆ ┆ ┆ ┆ ┆ ┆ en poste ┆ standard │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ fixe ┆ de soin │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ou d… │\n", "└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [
"%%time\n",
"# Rows of professionals (Identifiant PP) that have more than one distinct\n",
"# activity at the same site. NB: `pd` is polars (imported as `pd` in the first cell).\n",
"\n",
"# Site identifier candidates, in coalesce priority order.\n",
"site_id_columns = [\n",
"    'Numéro FINESS site',\n",
"    'Numéro SIRET site',\n",
"    'Identifiant technique de la structure',\n",
"]\n",
"# Labels whose combination defines one activity.\n",
"activity_columns = [\n",
"    'Libellé profession',\n",
"    'Libellé savoir-faire',\n",
"    'Libellé mode exercice',\n",
"    'Libellé rôle',\n",
"    'Libellé genre activité',\n",
"]\n",
"group_keys = ['Identifiant PP', 'Site_Identifier_Global']\n",
"\n",
"# First non-null identifier among the candidates.\n",
"site_identifier = pd.coalesce(\n",
"    *(pd.col(name) for name in site_id_columns)\n",
").alias('Site_Identifier_Global')\n",
"\n",
"# Number of distinct activity combinations per (professional, site).\n",
"activities_count = (\n",
"    pd.struct(activity_columns)\n",
"    .n_unique()\n",
"    .over(group_keys)\n",
"    .alias('Activites_Count')\n",
")\n",
"\n",
"df6 = (\n",
"    df\n",
"    .with_columns(site_identifier)\n",
"    # Rows without any site identifier cannot be grouped by site.\n",
"    .filter(pd.col('Site_Identifier_Global').is_not_null())\n",
"    .with_columns(activities_count)\n",
"    .filter(pd.col('Activites_Count') > 1)\n",
"    .sort([*group_keys, 'index'])\n",
"    .select([\n",
"        'index',\n",
"        *group_keys,\n",
"        *site_id_columns,\n",
"        'Activites_Count',\n",
"        *activity_columns,\n",
"    ])\n",
")\n",
"df6\n"
] }, { "cell_type": "code", "execution_count": 13, "id": "6f7025a7c08b54b4", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:46:17.851427Z", "start_time": "2025-08-19T22:46:17.796168Z" } }, "outputs": [], "source": [
"# Write df6 to the '-Multiple_Activities_Per_Site' output file:\n",
"# pipe-separated, never quoted, LF line endings.\n",
"multiple_activities_path = f'{folder}{output_file}-Multiple_Activities_Per_Site{output_extension}'\n",
"df6.write_csv(\n",
"    multiple_activities_path,\n",
"    separator='|',\n",
"    quote_style='never',\n",
"    line_terminator='\\n',\n",
")\n"
] }, { "cell_type": "code", "execution_count": 14, "id": "b18d9ba71ba63d9d", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:46:19.535052Z", "start_time": "2025-08-19T22:46:18.015194Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: total: 2.53 s\n", "Wall time: 483 ms\n" ] }, { "data": { "text/html": [ "
\n", "shape: (91_163, 5)
indexIdentifiant PPProfession_CountLibellé professionLibellé savoir-faire
u32stru32strstr
78"10000013150"2"Médecin""Psychiatrie"
277918"10000013150"2"Psychothérapeute"null
1393660"10000017979"2"Ostéopathe"null
1951128"10000017979"2"Sage-Femme"null
835571"10000029966"2"Sage-Femme"null
1670279"10111847264"2"Psychologue"null
1391880"10111849799"2"Psychothérapeute"null
1670314"10111849799"2"Psychologue"null
276592"10111877634"2"Psychothérapeute"null
1392260"10111877634"2"Psychologue"null
" ], "text/plain": [ "shape: (91_163, 5)\n", "┌─────────┬────────────────┬──────────────────┬────────────────────┬──────────────────────┐\n", "│ index ┆ Identifiant PP ┆ Profession_Count ┆ Libellé profession ┆ Libellé savoir-faire │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ str ┆ u32 ┆ str ┆ str │\n", "╞═════════╪════════════════╪══════════════════╪════════════════════╪══════════════════════╡\n", "│ 78 ┆ 10000013150 ┆ 2 ┆ Médecin ┆ Psychiatrie │\n", "│ 277918 ┆ 10000013150 ┆ 2 ┆ Psychothérapeute ┆ null │\n", "│ 1393660 ┆ 10000017979 ┆ 2 ┆ Ostéopathe ┆ null │\n", "│ 1951128 ┆ 10000017979 ┆ 2 ┆ Sage-Femme ┆ null │\n", "│ 835571 ┆ 10000029966 ┆ 2 ┆ Sage-Femme ┆ null │\n", "│ … ┆ … ┆ … ┆ … ┆ … │\n", "│ 1670279 ┆ 10111847264 ┆ 2 ┆ Psychologue ┆ null │\n", "│ 1391880 ┆ 10111849799 ┆ 2 ┆ Psychothérapeute ┆ null │\n", "│ 1670314 ┆ 10111849799 ┆ 2 ┆ Psychologue ┆ null │\n", "│ 276592 ┆ 10111877634 ┆ 2 ┆ Psychothérapeute ┆ null │\n", "│ 1392260 ┆ 10111877634 ┆ 2 ┆ Psychologue ┆ null │\n", "└─────────┴────────────────┴──────────────────┴────────────────────┴──────────────────────┘" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "df7 = (\n", " df\n", " .with_columns(\n", " pd.col(\"Libellé profession\").n_unique().over(\"Identifiant PP\").alias(\"Profession_Count\")\n", " )\n", " .filter(pd.col(\"Profession_Count\") > 1)\n", " .sort([\"Identifiant PP\", \"index\"])\n", " .select([\n", " \"index\",\n", " \"Identifiant PP\",\n", " \"Profession_Count\",\n", " \"Libellé profession\",\n", " \"Libellé savoir-faire\"\n", " ])\n", ")\n", "df7\n" ] }, { "cell_type": "code", "execution_count": 15, "id": "85be468fd3f461d1", "metadata": { "ExecuteTime": { "end_time": "2025-08-19T22:46:19.893214Z", "start_time": "2025-08-19T22:46:19.851765Z" } }, "outputs": [], "source": [ "df7.write_csv(f\"{folder}{output_file}-Multiple_Professions{output_extension}\",\n", " separator='|',\n", " quote_style=\"never\",\n", " 
line_terminator='\\n')\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.3" } }, "nbformat": 4, "nbformat_minor": 5 }