Files
appointment_tool/check_passport_length.py

237 lines
10 KiB
Python

import openpyxl
# import pandas as pd
def check_passport_lengths_pandas(file_path, passport_column='passport', name_column='name'):
"""
Check passport lengths in an Excel file using pandas.
Args:
file_path (str): Path to the Excel file
passport_column (str): Name of the column containing passport numbers (default: 'passport')
name_column (str): Name of the column containing names (default: 'name')
Returns:
list: List of tuples containing (row_index, name, passport_number, length) for invalid passports
"""
try:
# Read the Excel file
df = pd.read_excel(file_path)
# If the default 'passport' column doesn't exist, try common variations
available_columns = df.columns.tolist()
if passport_column not in available_columns:
# Look for variations of passport column name
passport_variations = ['passport', 'passport_number', 'passport_num', 'passports']
found_column = None
for variation in passport_variations:
if variation in available_columns:
found_column = variation
print(f"Info: Using column '{variation}' for passport data instead of '{passport_column}'")
break
if found_column is None:
print(f"Error: No passport column found. Available columns: {available_columns}")
return []
else:
passport_column = found_column
# If the default 'name' column doesn't exist, try common variations
if name_column not in available_columns:
# Look for variations of name column name
name_variations = ['name', 'full_name', 'first_name', 'last_name', 'customer_name']
found_name_column = None
for variation in name_variations:
if variation in available_columns:
found_name_column = variation
print(f"Info: Using column '{variation}' for name data")
break
if found_name_column is None:
print(f"Warning: No name column found. Available columns: {available_columns}")
# Use the first column as a fallback
name_column = available_columns[0] if available_columns else 'Unknown'
else:
name_column = found_name_column
invalid_passports = []
for index, row in df.iterrows():
passport = row[passport_column]
name = row[name_column] if name_column in row else 'Unknown'
# Handle NaN values
if pd.isna(passport):
invalid_passports.append((index + 2, name, 'NaN', 0)) # +2 to match Excel row numbers (header + 0-index)
continue
# Convert to string to ensure we can check the length
passport_str = str(passport).strip()
length = len(passport_str)
# Check if the length is not 9
if length != 9:
invalid_passports.append((index + 2, name, passport_str, length)) # +2 to match Excel row numbers
return invalid_passports
except ImportError:
# Pandas is not available, use openpyxl
return check_passport_lengths_openpyxl(file_path, passport_column, name_column)
except FileNotFoundError:
print(f"Error: File '{file_path}' not found.")
return []
except Exception as e:
print(f"Error reading the file with pandas: {str(e)}")
# Fall back to openpyxl if pandas fails
return check_passport_lengths_openpyxl(file_path, passport_column, name_column)
def check_passport_lengths_openpyxl(file_path, passport_column_header='passport', name_column_header='name'):
"""
Check passport lengths in an Excel file using openpyxl.
Args:
file_path (str): Path to the Excel file
passport_column_header (str): Name of the column header containing passport numbers (default: 'passport')
name_column_header (str): Name of the column header containing names (default: 'name')
Returns:
list: List of tuples containing (row_index, name, passport_number, length) for invalid passports
"""
try:
# Load the workbook and select the active worksheet
wb = openpyxl.load_workbook(file_path)
ws = wb.active
# Find the column index for passport based on header
passport_col_idx = None
name_col_idx = None
for col_idx, cell in enumerate(ws[1], 1): # ws[1] gets the first row
if cell.value == passport_column_header:
passport_col_idx = col_idx
if cell.value == name_column_header:
name_col_idx = col_idx
# If the default passport column doesn't exist, try common variations
if passport_col_idx is None:
passport_variations = ['passport', 'passport_number', 'passport_num', 'passports']
for variation in passport_variations:
for col_idx, cell in enumerate(ws[1], 1):
if cell.value == variation:
passport_col_idx = col_idx
print(f"Info: Using column '{variation}' for passport data")
break
if passport_col_idx is not None:
break
# If the default name column doesn't exist, try common variations
if name_col_idx is None:
name_variations = ['name', 'full_name', 'first_name', 'last_name', 'customer_name']
for variation in name_variations:
for col_idx, cell in enumerate(ws[1], 1):
if cell.value == variation:
name_col_idx = col_idx
print(f"Info: Using column '{variation}' for name data")
break
if name_col_idx is not None:
break
if passport_col_idx is None:
# Print available headers for user reference
headers = [cell.value for cell in ws[1] if cell.value is not None]
print(f"Error: No passport column found. Available columns: {headers}")
return []
if name_col_idx is None:
# If name column is not found, warn but continue to process
print(f"Warning: No name column found. Available columns: {[cell.value for cell in ws[1] if cell.value is not None]}")
# Use the first column as name to maintain backward compatibility
name_col_idx = 1
invalid_passports = []
# Start from row 2 to skip header
for row_idx, row in enumerate(ws.iter_rows(min_row=2, values_only=True), start=2):
# Check if the row has enough columns
if len(row) >= max(passport_col_idx, name_col_idx):
passport = row[passport_col_idx - 1] # Adjust for 0-based indexing
name = row[name_col_idx - 1] if len(row) >= name_col_idx else 'Unknown'
# Handle None values for passport
if passport is None:
invalid_passports.append((row_idx, name, 'None', 0))
continue
# Convert to string to ensure we can check the length
passport_str = str(passport).strip()
length = len(passport_str)
# Check if the length is not 9
if length != 9:
invalid_passports.append((row_idx, name, passport_str, length))
return invalid_passports
except FileNotFoundError:
print(f"Error: File '{file_path}' not found.")
return []
except Exception as e:
print(f"Error reading the file with openpyxl: {str(e)}")
return []
def check_passport_lengths(file_path, passport_column='passport', name_column='name'):
"""
Check passport lengths in an Excel file using pandas if available, otherwise openpyxl.
Args:
file_path (str): Path to the Excel file
passport_column (str): Name of the column containing passport numbers (default: 'passport')
name_column (str): Name of the column containing names (default: 'name')
Returns:
list: List of tuples containing (row_index, name, passport_number, length) for invalid passports
"""
# Try with pandas first, fall back to openpyxl if pandas fails
try:
import pandas as pd
return check_passport_lengths_pandas(file_path, passport_column, name_column)
except ImportError:
# Pandas is not available or has compatibility issues, use openpyxl
return check_passport_lengths_openpyxl(file_path, passport_column, name_column)
except FileNotFoundError:
print(f"Error: File '{file_path}' not found.")
return []
except Exception as e:
print(f"Error reading the file with pandas: {str(e)}")
# Fall back to openpyxl if pandas fails
return check_passport_lengths_openpyxl(file_path, passport_column, name_column)
def main():
import sys
if len(sys.argv) > 1:
file_path = sys.argv[1]
else:
file_path = 'real_name_contacts_100_aol_17_04.xlsx'
print(f"Checking passport lengths in file: {file_path}")
print("-" * 50)
# Check passport lengths
invalid_passports = check_passport_lengths(file_path)
if not invalid_passports:
print("All passports have valid length (9 characters).")
else:
print(f"Found {len(invalid_passports)} invalid passport(s):")
print("")
for row_idx, name, passport, length in invalid_passports:
print(f"Row {row_idx}: Name '{name}', Passport '{passport}' has length {length} (should be 9)")
print("")
print(f"Summary: {len(invalid_passports)} invalid passport(s) found out of total entries.")
if __name__ == "__main__":
main()