Migration to Polars

This commit is contained in:
2026-03-05 12:08:51 +00:00
parent 0ae48d63f2
commit 6e84919fcc
2 changed files with 101 additions and 35 deletions

View File

@@ -1,12 +1,10 @@
import argparse
import csv
import math
import sys
from os import path
from shutil import copyfileobj
from zipfile import ZipFile, is_zipfile
import numpy as np
import pandas as pd
import polars as pl
import requests
from tqdm import tqdm # could use from tqdm.gui import tqdm
from tqdm.utils import CallbackIOWrapper
@@ -15,43 +13,87 @@ from urllib3 import disable_warnings
import questionary
# Columns to blank out before import (personal data, or fields the import
# does not use). Names must match the CSV/Excel headers exactly; consumers
# filter this list defensively against the actual file schema, so an entry
# that is absent from a given file is simply ignored.
COLUMNS_TO_CLEAR = [
    "Type d'identifiant PP",
    "Identification nationale PP",
    "Libellé civilité d'exercice",
    "Code civilité",
    "Libellé civilité",
    "Code profession",
    "Code catégorie professionnelle",
    "Libellé catégorie professionnelle",
    "Code type savoir-faire",
    "Libellé type savoir-faire",
    "Code savoir-faire",
    "Code mode exercice",
    "Libellé mode exercice",
    "Numéro SIRET site",
    "Numéro SIREN site",
    "Numéro FINESS site",
    "Numéro FINESS établissement juridique",
    "Identifiant technique de la structure",
    "Raison sociale site",
    "Enseigne commerciale site",
    "Complément destinataire (coord. structure)",
    "Complément point géographique (coord. structure)",
    "Code type de voie (coord. structure)",
    "Code postal (coord. structure)",
    "Code commune (coord. structure)",
    "Libellé commune (coord. structure)",
    "Code pays (coord. structure)",
    "Libellé pays (coord. structure)",
    "Téléphone 2 (coord. structure)",
    "Télécopie (coord. structure)",
    "Adresse e-mail (coord. structure)",
    "Code Département (structure)",
    "Libellé Département (structure)",
    "Ancien identifiant de la structure",
    "Autorité d'enregistrement",
    "Code secteur d'activité",
    "Libellé secteur d'activité",
    "Code section tableau pharmaciens",
    "Libellé section tableau pharmaciens",
    "Code rôle",
    "Libellé rôle",
    "Code genre activité",
    "Libellé genre activité",
]
def process_professionals_table(xls_file, txt_file, output_file):
    """Filter and anonymize the professionals CSV, then write the import file.

    Reads the pipe-separated ``txt_file`` lazily with Polars, keeps only the
    rows whose 'Libellé profession' appears in the Excel 'F_Professions' tab,
    blanks out the personal/unused columns listed in COLUMNS_TO_CLEAR, appends
    the extra rows from the other Excel tabs, and writes everything to
    ``output_file`` as a single pipe-separated CSV.

    Parameters:
        xls_file: path to the Excel workbook (tabs F_Professions,
            F_Append_Update, F_Etrangers, F_Fake, F_Sophrologues).
        txt_file: path to the source pipe-separated CSV export.
        output_file: path of the CSV file to write.
    """
    # Load the Excel tabs. infer_schema_length=0 keeps every column as text,
    # matching the all-string treatment of the CSV below.
    professions = pl.read_excel(xls_file, sheet_name='F_Professions').get_column('Professions').to_list()
    df_append = pl.read_excel(xls_file, sheet_name='F_Append_Update', infer_schema_length=0)
    df_etrangers = pl.read_excel(xls_file, sheet_name='F_Etrangers', infer_schema_length=0)
    df_fake = pl.read_excel(xls_file, sheet_name='F_Fake', infer_schema_length=0)
    df_sophrolo = pl.read_excel(xls_file, sheet_name='F_Sophrologues', infer_schema_length=0)
    # Columns to blank that actually exist in the CSV (defensive filter:
    # COLUMNS_TO_CLEAR may list columns absent from this export).
    schema = pl.scan_csv(txt_file, separator='|', infer_schema=False, quote_char=None).collect_schema()
    cols_to_clear = [c for c in COLUMNS_TO_CLEAR if c in schema]
    # Lazy CSV scan: filter on profession, then overwrite sensitive columns
    # with empty strings. tqdm is used as a simple elapsed-time spinner here
    # (no total is known before collect()).
    with tqdm(desc=f'Processing {path.basename(txt_file)}', bar_format='{desc}... {elapsed}',
              leave=True) as spinner:
        result = (
            pl.scan_csv(txt_file, separator='|', infer_schema=False, quote_char=None)
            .filter(pl.col('Libellé profession').is_in(professions))
            .with_columns([pl.lit('').alias(c) for c in cols_to_clear])
            .collect()
        )
        # Fixed: separate the filename from the row count in the description.
        spinner.set_description(f'Processing {path.basename(txt_file)}: {len(result)} lignes')
    # Append the complementary rows from the other Excel tabs, applying the
    # same profession filter and column blanking.
    # NOTE(review): the blanking below only covers columns present in
    # df_append; columns unique to the other tabs are left as-is — confirm
    # this is intentional.
    extra = (
        pl.concat([df_append, df_etrangers, df_fake, df_sophrolo], how='diagonal_relaxed')
        .filter(pl.col('Libellé profession').is_in(professions))
        .with_columns([pl.lit('').alias(c) for c in cols_to_clear if c in df_append.columns])
    )
    # diagonal_relaxed aligns the CSV and Excel schemas by column name,
    # filling missing columns with nulls.
    final = pl.concat([result, extra], how='diagonal_relaxed')
    final.write_csv(output_file, separator='|', quote_style='never', line_terminator='\n')
    print(f"Written: {path.basename(output_file)} ({len(final)} lignes)")
def download_file(url: str, filename: str = False) -> object: