Migration to Polars
This commit is contained in:
24
.gitignore
vendored
24
.gitignore
vendored
@@ -1,3 +1,27 @@
|
|||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
.env
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
*.rar
|
*.rar
|
||||||
*.zip
|
*.zip
|
||||||
*.txt
|
*.txt
|
||||||
|
|||||||
@@ -1,12 +1,10 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import csv
|
|
||||||
import math
|
import math
|
||||||
import sys
|
import sys
|
||||||
from os import path
|
from os import path
|
||||||
from shutil import copyfileobj
|
from shutil import copyfileobj
|
||||||
from zipfile import ZipFile, is_zipfile
|
from zipfile import ZipFile, is_zipfile
|
||||||
import numpy as np
|
import polars as pl
|
||||||
import pandas as pd
|
|
||||||
import requests
|
import requests
|
||||||
from tqdm import tqdm # could use from tqdm.gui import tqdm
|
from tqdm import tqdm # could use from tqdm.gui import tqdm
|
||||||
from tqdm.utils import CallbackIOWrapper
|
from tqdm.utils import CallbackIOWrapper
|
||||||
@@ -15,43 +13,87 @@ from urllib3 import disable_warnings
|
|||||||
import questionary
|
import questionary
|
||||||
|
|
||||||
|
|
||||||
|
# Colonnes à effacer (données personnelles ou inutiles pour l'import)
|
||||||
|
COLUMNS_TO_CLEAR = [
|
||||||
|
"Type d'identifiant PP",
|
||||||
|
"Identification nationale PP",
|
||||||
|
"Libellé civilité d'exercice",
|
||||||
|
"Code civilité",
|
||||||
|
"Libellé civilité",
|
||||||
|
"Code profession",
|
||||||
|
"Code catégorie professionnelle",
|
||||||
|
"Libellé catégorie professionnelle",
|
||||||
|
"Code type savoir-faire",
|
||||||
|
"Libellé type savoir-faire",
|
||||||
|
"Code savoir-faire",
|
||||||
|
"Code mode exercice",
|
||||||
|
"Libellé mode exercice",
|
||||||
|
"Numéro SIRET site",
|
||||||
|
"Numéro SIREN site",
|
||||||
|
"Numéro FINESS site",
|
||||||
|
"Numéro FINESS établissement juridique",
|
||||||
|
"Identifiant technique de la structure",
|
||||||
|
"Raison sociale site",
|
||||||
|
"Enseigne commerciale site",
|
||||||
|
"Complément destinataire (coord. structure)",
|
||||||
|
"Complément point géographique (coord. structure)",
|
||||||
|
"Code type de voie (coord. structure)",
|
||||||
|
"Code postal (coord. structure)",
|
||||||
|
"Code commune (coord. structure)",
|
||||||
|
"Libellé commune (coord. structure)",
|
||||||
|
"Code pays (coord. structure)",
|
||||||
|
"Libellé pays (coord. structure)",
|
||||||
|
"Téléphone 2 (coord. structure)",
|
||||||
|
"Télécopie (coord. structure)",
|
||||||
|
"Adresse e-mail (coord. structure)",
|
||||||
|
"Code Département (structure)",
|
||||||
|
"Libellé Département (structure)",
|
||||||
|
"Ancien identifiant de la structure",
|
||||||
|
"Autorité d'enregistrement",
|
||||||
|
"Code secteur d'activité",
|
||||||
|
"Libellé secteur d'activité",
|
||||||
|
"Code section tableau pharmaciens",
|
||||||
|
"Libellé section tableau pharmaciens",
|
||||||
|
"Code rôle",
|
||||||
|
"Libellé rôle",
|
||||||
|
"Code genre activité",
|
||||||
|
"Libellé genre activité",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def process_professionals_table(xls_file, txt_file, output_file):
|
def process_professionals_table(xls_file, txt_file, output_file):
|
||||||
# Load Excel Dataframes
|
# Chargement des onglets Excel
|
||||||
xls = pd.read_excel(xls_file, sheet_name=None, dtype=str,
|
professions = pl.read_excel(xls_file, sheet_name='F_Professions').get_column('Professions').to_list()
|
||||||
na_values='', keep_default_na=False)
|
df_append = pl.read_excel(xls_file, sheet_name='F_Append_Update', infer_schema_length=0)
|
||||||
professions = xls['F_Professions']['Professions'].tolist()
|
df_etrangers = pl.read_excel(xls_file, sheet_name='F_Etrangers', infer_schema_length=0)
|
||||||
|
df_fake = pl.read_excel(xls_file, sheet_name='F_Fake', infer_schema_length=0)
|
||||||
|
df_sophrolo = pl.read_excel(xls_file, sheet_name='F_Sophrologues', infer_schema_length=0)
|
||||||
|
|
||||||
# CSV Progressbar initialisation
|
# Colonnes à effacer présentes dans le fichier (filtre défensif)
|
||||||
estimated_total_rows = sum(1 for _ in open(txt_file, 'rb')) - 1
|
schema = pl.scan_csv(txt_file, separator='|', infer_schema=False, quote_char=None).collect_schema()
|
||||||
chunk_size = 20000
|
cols_to_clear = [c for c in COLUMNS_TO_CLEAR if c in schema]
|
||||||
|
|
||||||
# Iterating over CSV file
|
# Lecture lazy du CSV + filtrage + effacement des colonnes sensibles
|
||||||
columns_to_clean = np.r_[0, 2, 4:7, 9, 11:16, 17:28, 30, 35:40, 41:56]
|
with tqdm(desc=f'Processing {path.basename(txt_file)}', bar_format='{desc}... {elapsed}',
|
||||||
with tqdm(total=estimated_total_rows, desc=f'Writing to {path.basename(output_file)}',
|
leave=True) as spinner:
|
||||||
leave=True, unit="Ln") as bar:
|
result = (
|
||||||
for i, df in enumerate(pd.read_csv(txt_file, sep='|', doublequote=False, quoting=csv.QUOTE_NONE,
|
pl.scan_csv(txt_file, separator='|', infer_schema=False, quote_char=None)
|
||||||
dtype=str, na_values='', keep_default_na=False, chunksize=chunk_size)):
|
.filter(pl.col('Libellé profession').is_in(professions))
|
||||||
n_rows = df.shape[0]
|
.with_columns([pl.lit('').alias(c) for c in cols_to_clear])
|
||||||
df.iloc[:, columns_to_clean] = ''
|
.collect()
|
||||||
df = df[df['Libellé profession'].isin(professions)]
|
)
|
||||||
|
spinner.set_description(f'Processing {path.basename(txt_file)} — {len(result)} lignes')
|
||||||
|
|
||||||
if i == 0:
|
# Ajout des données complémentaires des onglets Excel
|
||||||
df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE,
|
extra = (
|
||||||
lineterminator='\n')
|
pl.concat([df_append, df_etrangers, df_fake, df_sophrolo], how='diagonal_relaxed')
|
||||||
else:
|
.filter(pl.col('Libellé profession').is_in(professions))
|
||||||
df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE,
|
.with_columns([pl.lit('').alias(c) for c in cols_to_clear if c in df_append.columns])
|
||||||
lineterminator='\n', header=False, mode='a')
|
)
|
||||||
|
|
||||||
bar.update(n_rows)
|
final = pl.concat([result, extra], how='diagonal_relaxed')
|
||||||
bar.close()
|
final.write_csv(output_file, separator='|', quote_style='never', line_terminator='\n')
|
||||||
|
print(f"Written: {path.basename(output_file)} ({len(final)} lignes)")
|
||||||
# Appending Other xls tabs
|
|
||||||
df = pd.concat([df[:0], xls['F_Append_Update'], xls['F_Etrangers'],
|
|
||||||
xls['F_Fake'], xls['F_Sophrologues']], ignore_index=True)
|
|
||||||
df.iloc[:, columns_to_clean] = ''
|
|
||||||
df = df[df['Libellé profession'].isin(professions)]
|
|
||||||
df.to_csv(output_file, sep='|', index=False, doublequote=False, quoting=csv.QUOTE_NONE,
|
|
||||||
lineterminator='\n', header=False, mode='a')
|
|
||||||
|
|
||||||
|
|
||||||
def download_file(url: str, filename: str = False) -> object:
|
def download_file(url: str, filename: str = False) -> object:
|
||||||
|
|||||||
Reference in New Issue
Block a user