Migration to Polars
This commit is contained in:
@@ -1,12 +1,10 @@
|
||||
import argparse
|
||||
import csv
|
||||
import math
|
||||
import sys
|
||||
from os import path
|
||||
from shutil import copyfileobj
|
||||
from zipfile import ZipFile, is_zipfile
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import requests
|
||||
from tqdm import tqdm # could use from tqdm.gui import tqdm
|
||||
from tqdm.utils import CallbackIOWrapper
|
||||
@@ -15,43 +13,87 @@ from urllib3 import disable_warnings
|
||||
import questionary
|
||||
|
||||
|
||||
# Columns to blank out before export (personal data, or fields that are
# useless for the import). The names must match the source file headers
# exactly, accents included.
COLUMNS_TO_CLEAR = [
    "Type d'identifiant PP",
    "Identification nationale PP",
    "Libellé civilité d'exercice",
    "Code civilité",
    "Libellé civilité",
    "Code profession",
    "Code catégorie professionnelle",
    "Libellé catégorie professionnelle",
    "Code type savoir-faire",
    "Libellé type savoir-faire",
    "Code savoir-faire",
    "Code mode exercice",
    "Libellé mode exercice",
    "Numéro SIRET site",
    "Numéro SIREN site",
    "Numéro FINESS site",
    "Numéro FINESS établissement juridique",
    "Identifiant technique de la structure",
    "Raison sociale site",
    "Enseigne commerciale site",
    "Complément destinataire (coord. structure)",
    "Complément point géographique (coord. structure)",
    "Code type de voie (coord. structure)",
    "Code postal (coord. structure)",
    "Code commune (coord. structure)",
    "Libellé commune (coord. structure)",
    "Code pays (coord. structure)",
    "Libellé pays (coord. structure)",
    "Téléphone 2 (coord. structure)",
    "Télécopie (coord. structure)",
    "Adresse e-mail (coord. structure)",
    "Code Département (structure)",
    "Libellé Département (structure)",
    "Ancien identifiant de la structure",
    "Autorité d'enregistrement",
    "Code secteur d'activité",
    "Libellé secteur d'activité",
    "Code section tableau pharmaciens",
    "Libellé section tableau pharmaciens",
    "Code rôle",
    "Libellé rôle",
    "Code genre activité",
    "Libellé genre activité",
]
|
||||
|
||||
|
||||
def process_professionals_table(xls_file, txt_file, output_file):
    """Filter the professionals extract, blank sensitive columns and write it out.

    Rows are kept only when their 'Libellé profession' value appears in the
    'Professions' column of the 'F_Professions' sheet. The rows of the
    complementary Excel sheets go through the same filter and are appended
    after the rows coming from ``txt_file``. Every column named in
    ``COLUMNS_TO_CLEAR`` that is present is overwritten with an empty string.

    Args:
        xls_file: Excel workbook holding the sheets 'F_Professions',
            'F_Append_Update', 'F_Etrangers', 'F_Fake' and 'F_Sophrologues'.
        txt_file: Pipe-separated source file; it is read with quoting
            disabled and without schema inference.
        output_file: Path of the pipe-separated file to write (no quoting,
            '\\n' line endings).
    """
    # Excel sheets: the list of professions to keep, then the complementary
    # rows. infer_schema_length=0 turns type inference off for those sheets
    # (presumably so every cell stays text, like the CSV columns).
    professions = pl.read_excel(xls_file, sheet_name='F_Professions').get_column('Professions').to_list()
    df_append = pl.read_excel(xls_file, sheet_name='F_Append_Update', infer_schema_length=0)
    df_etrangers = pl.read_excel(xls_file, sheet_name='F_Etrangers', infer_schema_length=0)
    df_fake = pl.read_excel(xls_file, sheet_name='F_Fake', infer_schema_length=0)
    df_sophrolo = pl.read_excel(xls_file, sheet_name='F_Sophrologues', infer_schema_length=0)

    # A single lazy scan serves both the header check and the query itself.
    source = pl.scan_csv(txt_file, separator='|', infer_schema=False, quote_char=None)

    # Defensive filter: only blank the columns the file actually contains.
    schema = source.collect_schema()
    cols_to_clear = [c for c in COLUMNS_TO_CLEAR if c in schema]

    # Lazy CSV read + profession filter + blanking of the sensitive columns.
    with tqdm(desc=f'Processing {path.basename(txt_file)}', bar_format='{desc}... {elapsed}',
              leave=True) as spinner:
        result = (
            source
            .filter(pl.col('Libellé profession').is_in(professions))
            .with_columns([pl.lit('').alias(c) for c in cols_to_clear])
            .collect()
        )
        spinner.set_description(f'Processing {path.basename(txt_file)} — {len(result)} lignes')

    # Complementary rows from the Excel sheets. The columns to blank are
    # looked up on the concatenated frame, not on one sheet or on the CSV
    # header, so a sensitive column that only exists in some of the sheets
    # is still cleared.
    sheets = pl.concat([df_append, df_etrangers, df_fake, df_sophrolo], how='diagonal_relaxed')
    sheet_cols_to_clear = [c for c in COLUMNS_TO_CLEAR if c in sheets.columns]
    extra = (
        sheets
        .filter(pl.col('Libellé profession').is_in(professions))
        .with_columns([pl.lit('').alias(c) for c in sheet_cols_to_clear])
    )

    final = pl.concat([result, extra], how='diagonal_relaxed')
    final.write_csv(output_file, separator='|', quote_style='never', line_terminator='\n')
    print(f"Written: {path.basename(output_file)} ({len(final)} lignes)")
|
||||
|
||||
|
||||
def download_file(url: str, filename: str = False) -> object:
|
||||
|
||||
Reference in New Issue
Block a user