# professionals_from_sante_fr/prepareProfessionalsTable.py

import argparse
import csv
import math
import sys
from os import path
from shutil import copyfileobj
from zipfile import ZipFile, is_zipfile
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm  # could use from tqdm.gui import tqdm
from tqdm.utils import CallbackIOWrapper
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
import questionary


def process_professionals_table(xls_file, txt_file, output_file):
    """Filter the pipe-delimited extract and write the resulting CSV.

    The workbook provides the list of professions to keep (sheet
    ``F_Professions``, column ``Professions``) and extra rows to append
    (sheets ``F_Append_Update``, ``F_Etrangers``, ``F_Fake`` and
    ``F_Sophrologues``).  The text file is read in chunks; in every chunk a
    fixed set of column positions is blanked, rows whose
    ``Libellé profession`` is not in the list are dropped, and the remainder
    is written to ``output_file``.  The extra workbook rows get the same
    treatment and are appended last.

    Args:
        xls_file: Path of the Excel workbook.
        txt_file: Path of the ``|``-separated input file (with header row).
        output_file: Path of the ``|``-separated file to (over)write.

    Raises:
        ValueError: If the reader yields no chunk at all, so no header could
            be written.
    """
    # Load every sheet of the workbook as strings, keeping empty cells empty.
    xls = pd.read_excel(xls_file, sheet_name=None, dtype=str,
                        na_values='', keep_default_na=False)
    professions = xls['F_Professions']['Professions'].tolist()

    # Progress-bar total: number of lines minus the header.  The context
    # manager makes sure the counting handle is closed.
    with open(txt_file, 'rb') as raw:
        estimated_total_rows = sum(1 for _ in raw) - 1
    chunk_size = 20000

    read_options = dict(sep='|', doublequote=False, quoting=csv.QUOTE_NONE,
                        dtype=str, na_values='', keep_default_na=False)
    write_options = dict(sep='|', index=False, doublequote=False,
                         quoting=csv.QUOTE_NONE, lineterminator='\n')

    # Positions of the columns whose content is blanked in the output.
    columns_to_clean = np.r_[0, 2, 4:7, 9, 11:16, 17:28, 30, 35:40, 41:56]

    # Empty frame carrying the CSV column layout; reused so the appended
    # workbook rows are aligned on the same columns.
    template = None

    with tqdm(total=estimated_total_rows, desc=f'Writing to {path.basename(output_file)}',
              leave=True, unit="Ln") as bar, \
            pd.read_csv(txt_file, chunksize=chunk_size, **read_options) as reader:
        for i, chunk in enumerate(reader):
            n_rows = chunk.shape[0]
            chunk.iloc[:, columns_to_clean] = ''
            chunk = chunk[chunk['Libellé profession'].isin(professions)]

            # The first chunk creates the file with its header, later chunks
            # are appended without one.
            first = i == 0
            if first:
                template = chunk.iloc[:0]
            chunk.to_csv(output_file, header=first, mode='w' if first else 'a',
                         **write_options)

            # Progress counts rows read, not rows kept.
            bar.update(n_rows)

    if template is None:
        raise ValueError(f"No data could be read from {txt_file}")

    # Append the rows maintained in the other workbook tabs.
    extra = pd.concat([template, xls['F_Append_Update'], xls['F_Etrangers'],
                       xls['F_Fake'], xls['F_Sophrologues']], ignore_index=True)
    extra.iloc[:, columns_to_clean] = ''
    extra = extra[extra['Libellé profession'].isin(professions)]
    extra.to_csv(output_file, header=False, mode='a', **write_options)


def download_file(url: str, filename: str = False) -> None:
    """Stream ``url`` into a local file while showing a progress bar.

    Args:
        url: Address of the resource to download.
        filename: Destination path.  When falsy, the last ``/``-separated
            segment of ``url`` is used, in the current directory.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved in place of the expected file.
    """
    if filename:
        local_filename = filename
    else:
        local_filename = path.join(".", url.split('/')[-1])

    # Chunk size in KiB; also used to scale the bar so it counts in 'KB'.
    unit_scale = 64

    # NOTE(review): certificate verification is switched off and the related
    # warning silenced -- confirm this is still required for this server.
    disable_warnings(InsecureRequestWarning)

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, verify=False) as r:
        r.raise_for_status()

        # Content-Length may be absent; the bar then runs without a total.
        content_length = r.headers.get('Content-Length')
        if content_length:
            total_chunks = math.ceil(int(content_length) / 1024 / unit_scale)
        else:
            total_chunks = None

        with open(local_filename, 'wb') as fp:
            for chunk in tqdm(r.iter_content(chunk_size=unit_scale * 1024),
                              total=total_chunks,
                              unit_scale=unit_scale,
                              unit='KB',
                              desc=f"Downloading to {path.basename(local_filename)}",
                              leave=True):
                fp.write(chunk)


def extract_one_file_from_zip(zipfile, fromfile, tofile, desc=False):
    """Extract the first archive member whose name starts with ``fromfile``.

    Args:
        zipfile: Path of the zip archive.
        fromfile: Prefix of the member name to look for.
        tofile: Path the member content is written to.
        desc: Progress-bar label; when falsy a label built from ``tofile``
            is used.

    Returns:
        ``None`` on success, otherwise a message describing why nothing was
        extracted (not a valid zip file, or no member matches the prefix).
    """
    if not desc:
        desc = f"Extracting to {path.basename(tofile)}"
    if not is_zipfile(zipfile):
        return f"Can't open Zipfile (non existent or bad): {zipfile}"

    # Opening the archive in a ``with`` closes it on every exit path,
    # including the early "not found" return below.
    with ZipFile(zipfile) as zipf:
        member = next(
            (info for info in zipf.infolist() if info.filename.startswith(fromfile)),
            None,
        )
        if member is None:
            return f"No such file name in the Zip ({fromfile}*)..."

        with tqdm(
                desc=desc, unit="B", unit_scale=True, unit_divisor=1024,
                total=member.file_size, leave=True,
        ) as pbar:
            with zipf.open(member) as fi, open(tofile, "wb") as fo:
                # The wrapper reports every read to the progress bar.
                copyfileobj(CallbackIOWrapper(pbar.update, fi), fo)
    return None


def main():
    """Command-line entry point: download, unzip and process the extract.

    The positional ``fileName`` is the base name (no extension) shared by the
    ``.zip``, ``.txt`` and ``.csv`` files; ``--excelFileName`` is the base
    name of the ``.xlsx`` workbook.  All files are located next to this
    script.  Each of the three steps can be skipped with ``--noDownload``,
    ``--noUnzip`` and ``--noProcess``.  When the file name is left at its
    default the user is asked to confirm it; an empty or aborted answer
    exits the program.
    """
    defaultFileName = 'Table_Réf_Professionnels'
    defaultExcelFileName = 'Table_Réf_Professionnels'
    internalFileName = 'PS_LibreAcces_Personne_activite'

    parser = argparse.ArgumentParser(description='Prepare Professionals Table for Import to Endoziwig.')
    parser.add_argument('fileName', type=str, nargs='?', default=defaultFileName,
                        help=f'File name to use : default="{defaultFileName}"')
    parser.add_argument('--excelFileName', '-x', type=str, nargs='?', default=defaultExcelFileName,
                        help=f'Excel File Containing Append Data: default="{defaultExcelFileName}" (without extension)')
    parser.add_argument('--noDownload', '-ndw', action='store_true',
                        help='Do not Download the file (Default = Download).')
    parser.add_argument('--noUnzip', '-nuz', action='store_true',
                        help='Do not Unzip the file (Default = Unzip).')
    parser.add_argument('--noProcess', '-npr', action='store_true',
                        help='Do not Process the file (Default = Process).')

    args = parser.parse_args()

    if len(sys.argv) == 1:
        print("You're about to download and prepare Professionals Table for import to Endoziwig")

    # Files Settings
    if args.fileName == defaultFileName:
        print("\n")
        args.fileName = questionary.text("Please confirm file name (or empty to cancel):",
                                         default=defaultFileName).ask()
    # ask() yields None when the prompt is aborted; treat that like an empty
    # answer instead of building paths named "None.*".
    if not args.fileName:
        sys.exit(0)

    BASE_DIR = path.dirname(path.abspath(__file__))
    zipFileName = path.join(BASE_DIR, f'{args.fileName}.zip')
    xlsFileName = path.join(BASE_DIR, f'{args.excelFileName}.xlsx')
    txtFileName = path.join(BASE_DIR, f'{args.fileName}.txt')
    outputFileName = path.join(BASE_DIR, f'{args.fileName}.csv')

    print("\n")

    if not args.noDownload:
        download_file(
            'https://service.annuaire.sante.fr/annuaire-sante-webservices/V300/services/extraction/PS_LibreAcces',
            filename=zipFileName)
        print("\n")

    if not args.noUnzip:
        unzipResult = extract_one_file_from_zip(zipFileName, internalFileName, txtFileName)
        if unzipResult is not None:
            # Extraction was requested but failed: stop here rather than
            # process a missing or outdated text file.
            print(unzipResult)
            print("\n")
            return
        print("\n")

    if not args.noProcess:
        process_professionals_table(xlsFileName, txtFileName, outputFileName)
        print("\n")


if __name__ == '__main__':
    # Run the tool; any error is printed rather than propagated, and the
    # script always waits for Enter before exiting.
    try:
        main()
    except Exception as error:
        print(error)
    finally:
        input('Finished... Press Enter to continue')
        print('\n')