From 67a99a490bc8bba250ef69af5d10ae16a8833e94 Mon Sep 17 00:00:00 2001 From: Abdelkouddous LHACHIMI Date: Fri, 12 Dec 2025 23:45:15 +0100 Subject: [PATCH] Version Fonctionnelle --- .gitignore | 447 +++++++++++++++++++++++++++++++++++++ extract_endoconnect_pdf.py | 259 +++++++++++++++++++++ requirements.txt | 5 + 3 files changed, 711 insertions(+) create mode 100644 .gitignore create mode 100644 extract_endoconnect_pdf.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..877635f --- /dev/null +++ b/.gitignore @@ -0,0 +1,447 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. 
However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. 
However, if you prefer,
+# you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+# Streamlit
+.streamlit/secrets.toml
+
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+!*.code-workspace
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+# PDF Files
+*.pdf
diff --git a/extract_endoconnect_pdf.py b/extract_endoconnect_pdf.py
new file mode 100644
index 0000000..684f557
--- /dev/null
+++ b/extract_endoconnect_pdf.py
@@ -0,0 +1,320 @@
+"""Download Endoconnect patient record PDFs listed in an Excel table.
+
+The script logs in to the Endoconnect API, reads the patients flagged as
+finished from an Excel Table (ListObject) and saves one PDF per patient into
+an output directory.
+"""
+
+import os
+import sys
+import time
+from datetime import datetime
+
+import openpyxl
+import pandas as pd
+import questionary
+import requests
+from rich import print as rprint
+from rich.console import Console
+from rich.logging import RichHandler
+from rich.markup import escape
+from rich.panel import Panel
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+# -----------------------------------------------------------------------------
+# CONSTANTS & CONFIGURATION
+# -----------------------------------------------------------------------------
+
+# API Configuration
+API_URL = "https://api-endo.ziwig.com"
+LOGIN_ENDPOINT = "/api/auth/login"
+PDF_ENDPOINT_TEMPLATE = "/api/records/pdf/{}"
+
+# Defaults (User Configurable via Prompt)
+# The password default is read from the environment so that no secret is
+# stored in the repository; when unset the prompt simply starts empty.
+DEFAULT_USER_EMAIL = os.environ.get("ENDOCONNECT_EMAIL", "abdel.lhachimi@gmail.com")
+DEFAULT_USER_PASSWORD = os.environ.get("ENDOCONNECT_PASSWORD", "")
+DEFAULT_EXCEL_PATH = r"E:\Ziwig Drive\Ziwig Health\Data\Patients\Records_Status.xlsm"
+DEFAULT_OUTPUT_ROOT = r"Temp PDF"
+
+# Constants (Code Configurable Only)
+SHEET_TABLE_NAME = "Records_Status"  # Name of the Excel Table (ListObject)
+COL_PATIENT_ID = "id"  # Column name for Patient ID
+COL_PATIENT_NAME = "fullName"  # Column name for Patient Name
+COL_RECORD_FINISHED = "isFinished"  # Column name for boolean flag
+UNKNOWN_PATIENT_NAME = "Unknown_Patient"  # Used when the name cell is empty
+LOGIN_TIMEOUT = 10  # Seconds
+DOWNLOAD_TIMEOUT = 30  # Seconds
+DOWNLOAD_CHUNK_SIZE = 8192  # Bytes per streamed chunk
+
+# Characters that are replaced by "_" in generated file names
+INVALID_FILENAME_CHARS = '<>:"/\\|?*'
+_FILENAME_TRANSLATION = str.maketrans({char: "_" for char in INVALID_FILENAME_CHARS})
+
+# -----------------------------------------------------------------------------
+# SETUP CONSOLE
+# -----------------------------------------------------------------------------
+console = Console()
+
+# -----------------------------------------------------------------------------
+# FUNCTIONS
+# -----------------------------------------------------------------------------
+
+
+def get_credentials():
+    """Prompt for credentials until the API accepts them.
+
+    Returns:
+        A ``(token, email)`` tuple for the authenticated user.
+
+    Exits the process with status 1 when the email is empty or when either
+    prompt is cancelled.
+    """
+    while True:
+        email = questionary.text("Enter Endoconnect Email:", default=DEFAULT_USER_EMAIL).ask()
+        if not email:
+            console.print("[red]User email cannot be empty. Exiting.[/red]")
+            sys.exit(1)
+
+        password = questionary.password("Enter Endoconnect Password:", default=DEFAULT_USER_PASSWORD).ask()
+        if password is None:
+            # ask() returns None when the prompt is cancelled (e.g. Ctrl+C).
+            console.print("[red]Password prompt cancelled. Exiting.[/red]")
+            sys.exit(1)
+
+        # Verify credentials
+        with console.status("[bold green]Verifying credentials..."):
+            token = login(email, password)
+
+        if token:
+            console.print("[bold green]Login successful![/bold green]")
+            return token, email
+        console.print("[bold red]Login failed. Please try again.[/bold red]")
+
+
+def login(email, password):
+    """Authenticate with the API.
+
+    Returns:
+        The token from the login response, or ``None`` when the request fails
+        or the response does not contain a token.
+    """
+    url = f"{API_URL}{LOGIN_ENDPOINT}"
+    payload = {
+        "email": email,
+        "password": password,
+        "rememberMe": None,
+    }
+    try:
+        response = requests.post(url, json=payload, timeout=LOGIN_TIMEOUT)
+        response.raise_for_status()
+        data = response.json()
+    except (requests.exceptions.RequestException, ValueError) as e:
+        # Report the cause (network, HTTP status, invalid JSON) but keep the
+        # "None means failure" contract so the caller can prompt again.
+        console.print(f"[red]API Error: {escape(str(e))}[/red]")
+        return None
+    return data.get("token") if isinstance(data, dict) else None
+
+
+def get_excel_table_data(file_path, table_name):
+    """Return the Excel Table named *table_name* as a DataFrame.
+
+    All worksheets are searched; the first row of the table range provides the
+    column names. Exits the process with status 1 when the workbook cannot be
+    loaded or the table is not found.
+    """
+    try:
+        wb = openpyxl.load_workbook(file_path, data_only=True)
+    except Exception as e:
+        console.print(f"[bold red]Error loading Excel file: {escape(str(e))}[/bold red]")
+        sys.exit(1)
+
+    # Search for the table in all worksheets (chart sheets have no tables)
+    target_sheet = next((ws for ws in wb.worksheets if table_name in ws.tables), None)
+    if target_sheet is None:
+        console.print(f"[bold red]Table '{table_name}' not found in workbook.[/bold red]")
+        sys.exit(1)
+
+    # Indexing a worksheet with a range reference yields a tuple of cell rows
+    rows = list(target_sheet[target_sheet.tables[table_name].ref])
+    if not rows:
+        return pd.DataFrame()
+
+    # First row is header
+    header_row, *body_rows = rows
+    headers = [cell.value for cell in header_row]
+    data_rows = [[cell.value for cell in row] for row in body_rows]
+    return pd.DataFrame(data_rows, columns=headers)
+
+
+def download_pdf(token, patient_id, output_path, patient_name):
+    """Download the PDF of a patient to *output_path*.
+
+    Args:
+        token: Bearer token returned by login().
+        patient_id: Identifier inserted into the PDF endpoint URL.
+        output_path: Destination path of the PDF file.
+        patient_name: Unused; kept so existing callers keep working.
+
+    Returns:
+        A ``(success, duration_seconds, error_message)`` tuple. On failure the
+        duration is 0 and the message is the exception text.
+    """
+    url = f"{API_URL}{PDF_ENDPOINT_TEMPLATE.format(patient_id)}"
+    headers = {"Authorization": f"Bearer {token}"}
+    # Write to a temporary sibling file so that a failed or interrupted
+    # transfer never leaves a truncated PDF under the final name.
+    partial_path = f"{output_path}.part"
+
+    start_time = time.time()
+    try:
+        # The context manager releases the streamed connection in every case.
+        with requests.get(url, headers=headers, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
+            response.raise_for_status()
+            with open(partial_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
+                    f.write(chunk)
+        os.replace(partial_path, output_path)
+        return True, time.time() - start_time, None
+    except Exception as e:
+        # Best effort: report the failure so the batch can continue.
+        return False, 0, str(e)
+    finally:
+        # Cleanup is best effort; after a successful rename nothing is left.
+        try:
+            os.remove(partial_path)
+        except OSError:
+            pass
+
+
+def sanitize_filename(name):
+    """Replace characters that are invalid in file names and strip whitespace."""
+    return name.translate(_FILENAME_TRANSLATION).strip()
+
+
+def normalize_patient_name(raw_name):
+    """Collapse whitespace and title-case *raw_name*.
+
+    Empty cells (None, NaN, blank text) give UNKNOWN_PATIENT_NAME.
+    """
+    if pd.isna(raw_name):
+        return UNKNOWN_PATIENT_NAME
+    return " ".join(str(raw_name).split()).title() or UNKNOWN_PATIENT_NAME
+
+
+def unique_pdf_filename(stem, patient_id, used_names):
+    """Return ``<stem>.pdf``, adding the patient id when that name is taken.
+
+    *used_names* is the set of case-folded names already handed out; it is
+    updated in place. Without this, patients sharing a name would overwrite
+    each other's PDF.
+    """
+    filename = f"{stem}.pdf"
+    if filename.casefold() in used_names:
+        filename = f"{stem}_{sanitize_filename(str(patient_id))}.pdf"
+    used_names.add(filename.casefold())
+    return filename
+
+
+# -----------------------------------------------------------------------------
+# MAIN
+# -----------------------------------------------------------------------------
+
+
+def main():
+    """Run the interactive extraction: login, read the table, download PDFs."""
+    console.print(Panel.fit("[bold blue]Endoconnect Patient PDF Extractor[/bold blue]"))
+
+    # 1. Credentials
+    token, _ = get_credentials()
+
+    # 2. Configuration (Excel & Output)
+    # ask() returns None when a prompt is cancelled, hence the emptiness checks.
+    excel_path = questionary.path("Path to Excel file:", default=DEFAULT_EXCEL_PATH).ask()
+    if not excel_path or not os.path.exists(excel_path):
+        console.print(f"[bold red]File not found: {escape(str(excel_path))}[/bold red]")
+        sys.exit(1)
+
+    # Output Directory
+    today_str = datetime.now().strftime("PDFs-%Y%m%d")
+    default_output_dir = os.path.join(DEFAULT_OUTPUT_ROOT, today_str)
+    output_dir = questionary.path("Output Directory:", default=default_output_dir).ask()
+    if not output_dir:
+        console.print("[bold red]No output directory given.[/bold red]")
+        sys.exit(1)
+
+    if not os.path.isdir(output_dir):
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+            console.print(f"[green]Created output directory: {escape(output_dir)}[/green]")
+        except OSError as e:
+            console.print(f"[bold red]Could not create directory: {escape(str(e))}[/bold red]")
+            sys.exit(1)
+
+    console.print()  # Spacing
+
+    # 3. Read Data
+    console.print(f"Reading table '{SHEET_TABLE_NAME}' from Excel...")
+    df = get_excel_table_data(excel_path, SHEET_TABLE_NAME)
+    console.print()  # Spacing
+
+    # Validation
+    required_cols = [COL_PATIENT_ID, COL_PATIENT_NAME, COL_RECORD_FINISHED]
+    missing_cols = [c for c in required_cols if c not in df.columns]
+    if missing_cols:
+        console.print(f"[bold red]Missing columns in table: {', '.join(missing_cols)}[/bold red]")
+        sys.exit(1)
+
+    # Filter: eq(True) matches True and 1; empty cells never match.
+    patients_to_process = df[df[COL_RECORD_FINISHED].eq(True)]
+
+    total_patients = len(patients_to_process)
+    console.print(f"[bold]Found {total_patients} patients to process.[/bold]")
+    console.print()  # Add spacing
+
+    if total_patients == 0:
+        console.print(f"[yellow]No patients found with {COL_RECORD_FINISHED}=True. Exiting.[/yellow]")
+        return
+
+    # 4. Processing Loop
+    used_filenames = set()
+    failed_downloads = 0
+    for i, record in enumerate(patients_to_process.to_dict("records"), start=1):
+        p_id = record[COL_PATIENT_ID]
+        # Normalize Name: Remove extra spaces and Title Case
+        p_name = normalize_patient_name(record[COL_PATIENT_NAME])
+        filename = unique_pdf_filename(sanitize_filename(p_name), p_id, used_filenames)
+        file_path = os.path.join(output_dir, filename)
+
+        # rich status supports markup; escape() keeps "[" in names literal
+        msg_colored = f"{i}/{total_patients} - Downloading [bold cyan]{escape(filename)}[/bold cyan]"
+
+        with console.status(msg_colored, spinner="dots"):
+            success, duration, error = download_pdf(token, p_id, file_path, p_name)
+
+        if success:
+            console.print(f"[bold green]✓[/bold green] {msg_colored} ({duration:.2f}s)")
+        else:
+            failed_downloads += 1
+            console.print(f"[bold red]✗[/bold red] {msg_colored} [red]ERROR: {escape(str(error))}[/red]")
+
+    if failed_downloads:
+        console.print(f"[bold yellow]{failed_downloads} of {total_patients} downloads failed.[/bold yellow]")
+    console.print(Panel("[bold green]Extraction Finished![/bold green]"))
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        console.print("\n[yellow]Script interrupted by user.[/yellow]")
+        sys.exit(0)
+    except Exception as e:
+        console.print(f"\n[bold red]An unexpected error occurred: {escape(str(e))}[/bold red]")
+        sys.exit(1)
+    finally:
+        # Also runs after sys.exit(), so the console window stays open.
+        console.print()
+        input("Press Enter to close...")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5fee245
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+requests
+rich
+questionary
+pandas
+openpyxl