import openpyxl # import pandas as pd def check_passport_lengths_pandas(file_path, passport_column='passport', name_column='name'): """ Check passport lengths in an Excel file using pandas. Args: file_path (str): Path to the Excel file passport_column (str): Name of the column containing passport numbers (default: 'passport') name_column (str): Name of the column containing names (default: 'name') Returns: list: List of tuples containing (row_index, name, passport_number, length) for invalid passports """ try: # Read the Excel file df = pd.read_excel(file_path) # If the default 'passport' column doesn't exist, try common variations available_columns = df.columns.tolist() if passport_column not in available_columns: # Look for variations of passport column name passport_variations = ['passport', 'passport_number', 'passport_num', 'passports'] found_column = None for variation in passport_variations: if variation in available_columns: found_column = variation print(f"Info: Using column '{variation}' for passport data instead of '{passport_column}'") break if found_column is None: print(f"Error: No passport column found. Available columns: {available_columns}") return [] else: passport_column = found_column # If the default 'name' column doesn't exist, try common variations if name_column not in available_columns: # Look for variations of name column name name_variations = ['name', 'full_name', 'first_name', 'last_name', 'customer_name'] found_name_column = None for variation in name_variations: if variation in available_columns: found_name_column = variation print(f"Info: Using column '{variation}' for name data") break if found_name_column is None: print(f"Warning: No name column found. Available columns: {available_columns}") # Use the first column as a fallback name_column = available_columns[0] if available_columns else 'Unknown' else: name_column = found_name_column invalid_passports = [] for index, row in df.iterrows(): passport = row[passport_column] name = row[name_column] if name_column in row else 'Unknown' # Handle NaN values if pd.isna(passport): invalid_passports.append((index + 2, name, 'NaN', 0)) # +2 to match Excel row numbers (header + 0-index) continue # Convert to string to ensure we can check the length passport_str = str(passport).strip() length = len(passport_str) # Check if the length is not 9 if length != 9: invalid_passports.append((index + 2, name, passport_str, length)) # +2 to match Excel row numbers return invalid_passports except ImportError: # Pandas is not available, use openpyxl return check_passport_lengths_openpyxl(file_path, passport_column, name_column) except FileNotFoundError: print(f"Error: File '{file_path}' not found.") return [] except Exception as e: print(f"Error reading the file with pandas: {str(e)}") # Fall back to openpyxl if pandas fails return check_passport_lengths_openpyxl(file_path, passport_column, name_column) def check_passport_lengths_openpyxl(file_path, passport_column_header='passport', name_column_header='name'): """ Check passport lengths in an Excel file using openpyxl. Args: file_path (str): Path to the Excel file passport_column_header (str): Name of the column header containing passport numbers (default: 'passport') name_column_header (str): Name of the column header containing names (default: 'name') Returns: list: List of tuples containing (row_index, name, passport_number, length) for invalid passports """ try: # Load the workbook and select the active worksheet wb = openpyxl.load_workbook(file_path) ws = wb.active # Find the column index for passport based on header passport_col_idx = None name_col_idx = None for col_idx, cell in enumerate(ws[1], 1): # ws[1] gets the first row if cell.value == passport_column_header: passport_col_idx = col_idx if cell.value == name_column_header: name_col_idx = col_idx # If the default passport column doesn't exist, try common variations if passport_col_idx is None: passport_variations = ['passport', 'passport_number', 'passport_num', 'passports'] for variation in passport_variations: for col_idx, cell in enumerate(ws[1], 1): if cell.value == variation: passport_col_idx = col_idx print(f"Info: Using column '{variation}' for passport data") break if passport_col_idx is not None: break # If the default name column doesn't exist, try common variations if name_col_idx is None: name_variations = ['name', 'full_name', 'first_name', 'last_name', 'customer_name'] for variation in name_variations: for col_idx, cell in enumerate(ws[1], 1): if cell.value == variation: name_col_idx = col_idx print(f"Info: Using column '{variation}' for name data") break if name_col_idx is not None: break if passport_col_idx is None: # Print available headers for user reference headers = [cell.value for cell in ws[1] if cell.value is not None] print(f"Error: No passport column found. Available columns: {headers}") return [] if name_col_idx is None: # If name column is not found, warn but continue to process print(f"Warning: No name column found. Available columns: {[cell.value for cell in ws[1] if cell.value is not None]}") # Use the first column as name to maintain backward compatibility name_col_idx = 1 invalid_passports = [] # Start from row 2 to skip header for row_idx, row in enumerate(ws.iter_rows(min_row=2, values_only=True), start=2): # Check if the row has enough columns if len(row) >= max(passport_col_idx, name_col_idx): passport = row[passport_col_idx - 1] # Adjust for 0-based indexing name = row[name_col_idx - 1] if len(row) >= name_col_idx else 'Unknown' # Handle None values for passport if passport is None: invalid_passports.append((row_idx, name, 'None', 0)) continue # Convert to string to ensure we can check the length passport_str = str(passport).strip() length = len(passport_str) # Check if the length is not 9 if length != 9: invalid_passports.append((row_idx, name, passport_str, length)) return invalid_passports except FileNotFoundError: print(f"Error: File '{file_path}' not found.") return [] except Exception as e: print(f"Error reading the file with openpyxl: {str(e)}") return [] def check_passport_lengths(file_path, passport_column='passport', name_column='name'): """ Check passport lengths in an Excel file using pandas if available, otherwise openpyxl. Args: file_path (str): Path to the Excel file passport_column (str): Name of the column containing passport numbers (default: 'passport') name_column (str): Name of the column containing names (default: 'name') Returns: list: List of tuples containing (row_index, name, passport_number, length) for invalid passports """ # Try with pandas first, fall back to openpyxl if pandas fails try: import pandas as pd return check_passport_lengths_pandas(file_path, passport_column, name_column) except ImportError: # Pandas is not available or has compatibility issues, use openpyxl return check_passport_lengths_openpyxl(file_path, passport_column, name_column) except FileNotFoundError: print(f"Error: File '{file_path}' not found.") return [] except Exception as e: print(f"Error reading the file with pandas: {str(e)}") # Fall back to openpyxl if pandas fails return check_passport_lengths_openpyxl(file_path, passport_column, name_column) def main(): import sys if len(sys.argv) > 1: file_path = sys.argv[1] else: file_path = 'real_name_contacts_100_aol_17_04.xlsx' print(f"Checking passport lengths in file: {file_path}") print("-" * 50) # Check passport lengths invalid_passports = check_passport_lengths(file_path) if not invalid_passports: print("All passports have valid length (9 characters).") else: print(f"Found {len(invalid_passports)} invalid passport(s):") print("") for row_idx, name, passport, length in invalid_passports: print(f"Row {row_idx}: Name '{name}', Passport '{passport}' has length {length} (should be 9)") print("") print(f"Summary: {len(invalid_passports)} invalid passport(s) found out of total entries.") if __name__ == "__main__": main()