diff --git a/.gitignore b/.gitignore
index 220c29d..0869605 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,27 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+.env
+venv/
+ENV/
 *.rar
 *.zip
 *.txt
diff --git a/prepareProfessionalsTable.py b/prepareProfessionalsTable.py
index 3e7fa88..fac9812 100644
--- a/prepareProfessionalsTable.py
+++ b/prepareProfessionalsTable.py
@@ -1,12 +1,10 @@
 import argparse
-import csv
 import math
 import sys
 from os import path
 from shutil import copyfileobj
 from zipfile import ZipFile, is_zipfile
-import numpy as np
-import pandas as pd
+import polars as pl
 import requests
 from tqdm import tqdm  # could use from tqdm.gui import tqdm
 from tqdm.utils import CallbackIOWrapper
@@ -15,43 +13,102 @@ from urllib3 import disable_warnings
 import questionary
 
 
+# Columns to clear (personal data or not needed for the import)
+COLUMNS_TO_CLEAR = [
+    "Type d'identifiant PP",
+    "Identification nationale PP",
+    "Libellé civilité d'exercice",
+    "Code civilité",
+    "Libellé civilité",
+    "Code profession",
+    "Code catégorie professionnelle",
+    "Libellé catégorie professionnelle",
+    "Code type savoir-faire",
+    "Libellé type savoir-faire",
+    "Code savoir-faire",
+    "Code mode exercice",
+    "Libellé mode exercice",
+    "Numéro SIRET site",
+    "Numéro SIREN site",
+    "Numéro FINESS site",
+    "Numéro FINESS établissement juridique",
+    "Identifiant technique de la structure",
+    "Raison sociale site",
+    "Enseigne commerciale site",
+    "Complément destinataire (coord. structure)",
+    "Complément point géographique (coord. structure)",
+    "Code type de voie (coord. structure)",
+    "Code postal (coord. structure)",
+    "Code commune (coord. structure)",
+    "Libellé commune (coord. structure)",
+    "Code pays (coord. structure)",
+    "Libellé pays (coord. structure)",
+    "Téléphone 2 (coord. structure)",
+    "Télécopie (coord. structure)",
+    "Adresse e-mail (coord. structure)",
+    "Code Département (structure)",
+    "Libellé Département (structure)",
+    "Ancien identifiant de la structure",
+    "Autorité d'enregistrement",
+    "Code secteur d'activité",
+    "Libellé secteur d'activité",
+    "Code section tableau pharmaciens",
+    "Libellé section tableau pharmaciens",
+    "Code rôle",
+    "Libellé rôle",
+    "Code genre activité",
+    "Libellé genre activité",
+]
+
+
 def process_professionals_table(xls_file, txt_file, output_file):
-    # Load Excel Dataframes
-    xls = pd.read_excel(xls_file, sheet_name=None, dtype=str,
-                        na_values='', keep_default_na=False)
-    professions = xls['F_Professions']['Professions'].tolist()
+    """Write the filtered professionals table to a pipe-separated file.
+
+    Rows from ``txt_file`` and from the complementary Excel sheets are kept
+    only when their 'Libellé profession' value is listed in the
+    'F_Professions' sheet; every column named in COLUMNS_TO_CLEAR that exists
+    in ``txt_file`` is overwritten with an empty string.
+
+    Args:
+        xls_file: Excel workbook with the sheets F_Professions,
+            F_Append_Update, F_Etrangers, F_Fake and F_Sophrologues.
+        txt_file: Pipe-separated input file, read with quoting disabled.
+        output_file: Path of the pipe-separated file to write.
+    """
+    # Load the Excel sheets
+    professions = pl.read_excel(xls_file, sheet_name='F_Professions').get_column('Professions').to_list()
+    df_append = pl.read_excel(xls_file, sheet_name='F_Append_Update', infer_schema_length=0)
+    df_etrangers = pl.read_excel(xls_file, sheet_name='F_Etrangers', infer_schema_length=0)
+    df_fake = pl.read_excel(xls_file, sheet_name='F_Fake', infer_schema_length=0)
+    df_sophrolo = pl.read_excel(xls_file, sheet_name='F_Sophrologues', infer_schema_length=0)
 
-    # CSV Progressbar initialisation
-    estimated_total_rows = sum(1 for _ in open(txt_file, 'rb')) - 1
-    chunk_size = 20000
+    # Columns to clear that actually exist in the file (defensive filter)
+    schema = pl.scan_csv(txt_file, separator='|', infer_schema=False, quote_char=None).collect_schema()
+    cols_to_clear = [c for c in COLUMNS_TO_CLEAR if c in schema]
 
-    # Iterating over CSV file
-    columns_to_clean = np.r_[0, 2, 4:7, 9, 11:16, 17:28, 30, 35:40, 41:56]
-    with tqdm(total=estimated_total_rows, desc=f'Writing to {path.basename(output_file)}',
-              leave=True, unit="Ln") as bar:
-        for i, df in enumerate(pd.read_csv(txt_file, sep='|', doublequote=False, quoting=csv.QUOTE_NONE,
-                                           dtype=str, na_values='', keep_default_na=False, chunksize=chunk_size)):
-            n_rows = df.shape[0]
-            df.iloc[:, columns_to_clean] = ''
-            df = df[df['Libellé profession'].isin(professions)]
+    # Lazy CSV read + filtering + clearing of the sensitive columns
+    with tqdm(desc=f'Processing {path.basename(txt_file)}', bar_format='{desc}... {elapsed}',
+              leave=True) as spinner:
+        result = (
+            pl.scan_csv(txt_file, separator='|', infer_schema=False, quote_char=None)
+            .filter(pl.col('Libellé profession').is_in(professions))
+            .with_columns([pl.lit('').alias(c) for c in cols_to_clear])
+            .collect()
+        )
+        spinner.set_description(f'Processing {path.basename(txt_file)} — {len(result)} lignes')
 
-            if i == 0:
-                df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE,
-                          lineterminator='\n')
-            else:
-                df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE,
-                          lineterminator='\n', header=False, mode='a')
+    # Append the complementary data from the Excel sheets
+    extra = (
+        pl.concat([df_append, df_etrangers, df_fake, df_sophrolo], how='diagonal_relaxed')
+        .filter(pl.col('Libellé profession').is_in(professions))
+        # Clear every sensitive column, not only those of F_Append_Update: the
+        # other sheets may carry columns that F_Append_Update lacks.
+        .with_columns([pl.lit('').alias(c) for c in cols_to_clear])
+    )
 
-            bar.update(n_rows)
-        bar.close()
-
-    # Appending Other xls tabs
-    df = pd.concat([df[:0], xls['F_Append_Update'], xls['F_Etrangers'],
-                    xls['F_Fake'], xls['F_Sophrologues']], ignore_index=True)
-    df.iloc[:, columns_to_clean] = ''
-    df = df[df['Libellé profession'].isin(professions)]
-    df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE,
-              lineterminator='\n', header=False, mode='a')
+    final = pl.concat([result, extra], how='diagonal_relaxed')
+    final.write_csv(output_file, separator='|', quote_style='never', line_terminator='\n')
+    print(f"Written: {path.basename(output_file)} ({len(final)} lignes)")
 
 
 def download_file(url: str, filename: str = False) -> object: