Migration to Polars

This commit is contained in:
2026-03-05 12:08:51 +00:00
parent 0ae48d63f2
commit 6e84919fcc
2 changed files with 101 additions and 35 deletions

24
.gitignore vendored
View File

@@ -1,3 +1,27 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
.env
venv/
ENV/
*.rar
*.zip
*.txt

View File

@@ -1,12 +1,10 @@
 import argparse
-import csv
 import math
 import sys
 from os import path
 from shutil import copyfileobj
 from zipfile import ZipFile, is_zipfile
-import numpy as np
-import pandas as pd
+import polars as pl
 import requests
 from tqdm import tqdm  # could use from tqdm.gui import tqdm
 from tqdm.utils import CallbackIOWrapper
@@ -15,43 +13,87 @@ from urllib3 import disable_warnings
import questionary import questionary
# Colonnes à effacer (données personnelles ou inutiles pour l'import)
COLUMNS_TO_CLEAR = [
"Type d'identifiant PP",
"Identification nationale PP",
"Libellé civilité d'exercice",
"Code civilité",
"Libellé civilité",
"Code profession",
"Code catégorie professionnelle",
"Libellé catégorie professionnelle",
"Code type savoir-faire",
"Libellé type savoir-faire",
"Code savoir-faire",
"Code mode exercice",
"Libellé mode exercice",
"Numéro SIRET site",
"Numéro SIREN site",
"Numéro FINESS site",
"Numéro FINESS établissement juridique",
"Identifiant technique de la structure",
"Raison sociale site",
"Enseigne commerciale site",
"Complément destinataire (coord. structure)",
"Complément point géographique (coord. structure)",
"Code type de voie (coord. structure)",
"Code postal (coord. structure)",
"Code commune (coord. structure)",
"Libellé commune (coord. structure)",
"Code pays (coord. structure)",
"Libellé pays (coord. structure)",
"Téléphone 2 (coord. structure)",
"Télécopie (coord. structure)",
"Adresse e-mail (coord. structure)",
"Code Département (structure)",
"Libellé Département (structure)",
"Ancien identifiant de la structure",
"Autorité d'enregistrement",
"Code secteur d'activité",
"Libellé secteur d'activité",
"Code section tableau pharmaciens",
"Libellé section tableau pharmaciens",
"Code rôle",
"Libellé rôle",
"Code genre activité",
"Libellé genre activité",
]
 def process_professionals_table(xls_file, txt_file, output_file):
-    # Load Excel Dataframes
-    xls = pd.read_excel(xls_file, sheet_name=None, dtype=str,
-                        na_values='', keep_default_na=False)
-    professions = xls['F_Professions']['Professions'].tolist()
+    # Chargement des onglets Excel
+    professions = pl.read_excel(xls_file, sheet_name='F_Professions').get_column('Professions').to_list()
+    df_append = pl.read_excel(xls_file, sheet_name='F_Append_Update', infer_schema_length=0)
+    df_etrangers = pl.read_excel(xls_file, sheet_name='F_Etrangers', infer_schema_length=0)
+    df_fake = pl.read_excel(xls_file, sheet_name='F_Fake', infer_schema_length=0)
+    df_sophrolo = pl.read_excel(xls_file, sheet_name='F_Sophrologues', infer_schema_length=0)

-    # CSV Progressbar initialisation
-    estimated_total_rows = sum(1 for _ in open(txt_file, 'rb')) - 1
-    chunk_size = 20000
+    # Colonnes à effacer présentes dans le fichier (filtre défensif)
+    schema = pl.scan_csv(txt_file, separator='|', infer_schema=False, quote_char=None).collect_schema()
+    cols_to_clear = [c for c in COLUMNS_TO_CLEAR if c in schema]

-    # Iterating over CSV file
-    columns_to_clean = np.r_[0, 2, 4:7, 9, 11:16, 17:28, 30, 35:40, 41:56]
-    with tqdm(total=estimated_total_rows, desc=f'Writing to {path.basename(output_file)}',
-              leave=True, unit="Ln") as bar:
-        for i, df in enumerate(pd.read_csv(txt_file, sep='|', doublequote=False, quoting=csv.QUOTE_NONE,
-                                           dtype=str, na_values='', keep_default_na=False, chunksize=chunk_size)):
-            n_rows = df.shape[0]
-            df.iloc[:, columns_to_clean] = ''
-            df = df[df['Libellé profession'].isin(professions)]
-            if i == 0:
-                df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE,
-                          lineterminator='\n')
-            else:
-                df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE,
-                          lineterminator='\n', header=False, mode='a')
-            bar.update(n_rows)
-        bar.close()
-
-    # Appending Other xls tabs
-    df = pd.concat([df[:0], xls['F_Append_Update'], xls['F_Etrangers'],
-                    xls['F_Fake'], xls['F_Sophrologues']], ignore_index=True)
-    df.iloc[:, columns_to_clean] = ''
-    df = df[df['Libellé profession'].isin(professions)]
-    df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE,
-              lineterminator='\n', header=False, mode='a')
+    # Lecture lazy du CSV + filtrage + effacement des colonnes sensibles
+    with tqdm(desc=f'Processing {path.basename(txt_file)}', bar_format='{desc}... {elapsed}',
+              leave=True) as spinner:
+        result = (
+            pl.scan_csv(txt_file, separator='|', infer_schema=False, quote_char=None)
+            .filter(pl.col('Libellé profession').is_in(professions))
+            .with_columns([pl.lit('').alias(c) for c in cols_to_clear])
+            .collect()
+        )
+        spinner.set_description(f'Processing {path.basename(txt_file)}{len(result)} lignes')
+
+    # Ajout des données complémentaires des onglets Excel
+    extra = (
+        pl.concat([df_append, df_etrangers, df_fake, df_sophrolo], how='diagonal_relaxed')
+        .filter(pl.col('Libellé profession').is_in(professions))
+        .with_columns([pl.lit('').alias(c) for c in cols_to_clear if c in df_append.columns])
+    )
+    final = pl.concat([result, extra], how='diagonal_relaxed')
+    final.write_csv(output_file, separator='|', quote_style='never', line_terminator='\n')
+    print(f"Written: {path.basename(output_file)} ({len(final)} lignes)")

 def download_file(url: str, filename: str = False) -> object: