From 4f01669d61ec0c1f9229aba414ebe1c9ebee5406 Mon Sep 17 00:00:00 2001 From: Lei PAN Date: Sat, 27 Sep 2025 10:49:31 +0200 Subject: [PATCH] add sccript to check passport length --- check_passport_length.py | 237 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 check_passport_length.py diff --git a/check_passport_length.py b/check_passport_length.py new file mode 100644 index 0000000..df2009d --- /dev/null +++ b/check_passport_length.py @@ -0,0 +1,237 @@ +import openpyxl +# import pandas as pd + + +def check_passport_lengths_pandas(file_path, passport_column='passport', name_column='name'): + """ + Check passport lengths in an Excel file using pandas. + + Args: + file_path (str): Path to the Excel file + passport_column (str): Name of the column containing passport numbers (default: 'passport') + name_column (str): Name of the column containing names (default: 'name') + + Returns: + list: List of tuples containing (row_index, name, passport_number, length) for invalid passports + """ + try: + # Read the Excel file + df = pd.read_excel(file_path) + + # If the default 'passport' column doesn't exist, try common variations + available_columns = df.columns.tolist() + if passport_column not in available_columns: + # Look for variations of passport column name + passport_variations = ['passport', 'passport_number', 'passport_num', 'passports'] + found_column = None + for variation in passport_variations: + if variation in available_columns: + found_column = variation + print(f"Info: Using column '{variation}' for passport data instead of '{passport_column}'") + break + + if found_column is None: + print(f"Error: No passport column found. Available columns: {available_columns}") + return [] + else: + passport_column = found_column + + # If the default 'name' column doesn't exist, try common variations + if name_column not in available_columns: + # Look for variations of name column name + name_variations = ['name', 'full_name', 'first_name', 'last_name', 'customer_name'] + found_name_column = None + for variation in name_variations: + if variation in available_columns: + found_name_column = variation + print(f"Info: Using column '{variation}' for name data") + break + + if found_name_column is None: + print(f"Warning: No name column found. Available columns: {available_columns}") + # Use the first column as a fallback + name_column = available_columns[0] if available_columns else 'Unknown' + else: + name_column = found_name_column + + invalid_passports = [] + + for index, row in df.iterrows(): + passport = row[passport_column] + name = row[name_column] if name_column in row else 'Unknown' + + # Handle NaN values + if pd.isna(passport): + invalid_passports.append((index + 2, name, 'NaN', 0)) # +2 to match Excel row numbers (header + 0-index) + continue + + # Convert to string to ensure we can check the length + passport_str = str(passport).strip() + length = len(passport_str) + + # Check if the length is not 9 + if length != 9: + invalid_passports.append((index + 2, name, passport_str, length)) # +2 to match Excel row numbers + + return invalid_passports + + except ImportError: + # Pandas is not available, use openpyxl + return check_passport_lengths_openpyxl(file_path, passport_column, name_column) + except FileNotFoundError: + print(f"Error: File '{file_path}' not found.") + return [] + except Exception as e: + print(f"Error reading the file with pandas: {str(e)}") + # Fall back to openpyxl if pandas fails + return check_passport_lengths_openpyxl(file_path, passport_column, name_column) + +def check_passport_lengths_openpyxl(file_path, passport_column_header='passport', name_column_header='name'): + """ + Check passport lengths in an Excel file using openpyxl. + + Args: + file_path (str): Path to the Excel file + passport_column_header (str): Name of the column header containing passport numbers (default: 'passport') + name_column_header (str): Name of the column header containing names (default: 'name') + + Returns: + list: List of tuples containing (row_index, name, passport_number, length) for invalid passports + """ + try: + # Load the workbook and select the active worksheet + wb = openpyxl.load_workbook(file_path) + ws = wb.active + + # Find the column index for passport based on header + passport_col_idx = None + name_col_idx = None + + for col_idx, cell in enumerate(ws[1], 1): # ws[1] gets the first row + if cell.value == passport_column_header: + passport_col_idx = col_idx + if cell.value == name_column_header: + name_col_idx = col_idx + + # If the default passport column doesn't exist, try common variations + if passport_col_idx is None: + passport_variations = ['passport', 'passport_number', 'passport_num', 'passports'] + for variation in passport_variations: + for col_idx, cell in enumerate(ws[1], 1): + if cell.value == variation: + passport_col_idx = col_idx + print(f"Info: Using column '{variation}' for passport data") + break + if passport_col_idx is not None: + break + + # If the default name column doesn't exist, try common variations + if name_col_idx is None: + name_variations = ['name', 'full_name', 'first_name', 'last_name', 'customer_name'] + for variation in name_variations: + for col_idx, cell in enumerate(ws[1], 1): + if cell.value == variation: + name_col_idx = col_idx + print(f"Info: Using column '{variation}' for name data") + break + if name_col_idx is not None: + break + + if passport_col_idx is None: + # Print available headers for user reference + headers = [cell.value for cell in ws[1] if cell.value is not None] + print(f"Error: No passport column found. Available columns: {headers}") + return [] + + if name_col_idx is None: + # If name column is not found, warn but continue to process + print(f"Warning: No name column found. Available columns: {[cell.value for cell in ws[1] if cell.value is not None]}") + # Use the first column as name to maintain backward compatibility + name_col_idx = 1 + + invalid_passports = [] + + # Start from row 2 to skip header + for row_idx, row in enumerate(ws.iter_rows(min_row=2, values_only=True), start=2): + # Check if the row has enough columns + if len(row) >= max(passport_col_idx, name_col_idx): + passport = row[passport_col_idx - 1] # Adjust for 0-based indexing + name = row[name_col_idx - 1] if len(row) >= name_col_idx else 'Unknown' + + # Handle None values for passport + if passport is None: + invalid_passports.append((row_idx, name, 'None', 0)) + continue + + # Convert to string to ensure we can check the length + passport_str = str(passport).strip() + length = len(passport_str) + + # Check if the length is not 9 + if length != 9: + invalid_passports.append((row_idx, name, passport_str, length)) + + return invalid_passports + + except FileNotFoundError: + print(f"Error: File '{file_path}' not found.") + return [] + except Exception as e: + print(f"Error reading the file with openpyxl: {str(e)}") + return [] + +def check_passport_lengths(file_path, passport_column='passport', name_column='name'): + """ + Check passport lengths in an Excel file using pandas if available, otherwise openpyxl. + + Args: + file_path (str): Path to the Excel file + passport_column (str): Name of the column containing passport numbers (default: 'passport') + name_column (str): Name of the column containing names (default: 'name') + + Returns: + list: List of tuples containing (row_index, name, passport_number, length) for invalid passports + """ + # Try with pandas first, fall back to openpyxl if pandas fails + try: + import pandas as pd + return check_passport_lengths_pandas(file_path, passport_column, name_column) + + except ImportError: + # Pandas is not available or has compatibility issues, use openpyxl + return check_passport_lengths_openpyxl(file_path, passport_column, name_column) + except FileNotFoundError: + print(f"Error: File '{file_path}' not found.") + return [] + except Exception as e: + print(f"Error reading the file with pandas: {str(e)}") + # Fall back to openpyxl if pandas fails + return check_passport_lengths_openpyxl(file_path, passport_column, name_column) + +def main(): + import sys + if len(sys.argv) > 1: + file_path = sys.argv[1] + else: + file_path = 'real_name_contacts_100_aol_17_04.xlsx' + + print(f"Checking passport lengths in file: {file_path}") + print("-" * 50) + + # Check passport lengths + invalid_passports = check_passport_lengths(file_path) + + if not invalid_passports: + print("All passports have valid length (9 characters).") + else: + print(f"Found {len(invalid_passports)} invalid passport(s):") + print("") + + for row_idx, name, passport, length in invalid_passports: + print(f"Row {row_idx}: Name '{name}', Passport '{passport}' has length {length} (should be 9)") + + print("") + print(f"Summary: {len(invalid_passports)} invalid passport(s) found out of total entries.") + +if __name__ == "__main__": + main() \ No newline at end of file