add script to check passport length
This commit is contained in:
@@ -0,0 +1,237 @@
|
|||||||
|
import openpyxl
|
||||||
|
# import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def check_passport_lengths_pandas(file_path, passport_column='passport', name_column='name'):
    """
    Check passport lengths in an Excel file using pandas.

    The workbook is read with ``pandas.read_excel``. When ``passport_column``
    or ``name_column`` is not one of the sheet's columns, a fixed list of
    common alternative header names is tried instead. A row is reported when
    its passport value is missing, or when the stripped string form of the
    value is not exactly 9 characters long.

    If pandas cannot be imported, or reading/processing fails for any reason
    other than a missing file, the work is delegated to
    ``check_passport_lengths_openpyxl``.

    Args:
        file_path (str): Path to the Excel file
        passport_column (str): Name of the column containing passport numbers (default: 'passport')
        name_column (str): Name of the column containing names (default: 'name')

    Returns:
        list: List of tuples containing (row_index, name, passport_number, length) for invalid passports.
            ``row_index`` is the Excel row number (the header is row 1).
            Missing passports are reported as ``'NaN'`` with length 0.
            An empty list is also returned when the file does not exist or
            no passport column can be found.
    """
    try:
        # Imported here (not at module level) so pandas stays optional: the
        # module never binds ``pd`` globally, and a missing installation is
        # handled by the ImportError branch below.
        import pandas as pd

        # Read the Excel file
        df = pd.read_excel(file_path)
        available_columns = df.columns.tolist()

        # If the requested passport column doesn't exist, try common variations
        if passport_column not in available_columns:
            passport_variations = ['passport', 'passport_number', 'passport_num', 'passports']
            found_column = next(
                (variation for variation in passport_variations if variation in available_columns),
                None,
            )
            if found_column is None:
                print(f"Error: No passport column found. Available columns: {available_columns}")
                return []
            print(f"Info: Using column '{found_column}' for passport data instead of '{passport_column}'")
            passport_column = found_column

        # If the requested name column doesn't exist, try common variations
        if name_column not in available_columns:
            name_variations = ['name', 'full_name', 'first_name', 'last_name', 'customer_name']
            found_name_column = next(
                (variation for variation in name_variations if variation in available_columns),
                None,
            )
            if found_name_column is None:
                print(f"Warning: No name column found. Available columns: {available_columns}")
                # Use the first column as a fallback
                name_column = available_columns[0] if available_columns else 'Unknown'
            else:
                print(f"Info: Using column '{found_name_column}' for name data")
                name_column = found_name_column

        invalid_passports = []

        # Number rows by position rather than by index label so the reported
        # row is correct whatever the DataFrame index looks like.
        # +2 maps a 0-based position to an Excel row number (header + 0-index).
        for position, (_, row) in enumerate(df.iterrows()):
            excel_row = position + 2
            passport = row[passport_column]
            name = row[name_column] if name_column in row else 'Unknown'

            # Handle NaN values
            if pd.isna(passport):
                invalid_passports.append((excel_row, name, 'NaN', 0))
                continue

            # An integer column that contains blanks is read as float, so
            # 123456789 arrives as 123456789.0; drop the artificial ".0"
            # before measuring, otherwise valid numbers look 11 chars long.
            if isinstance(passport, float) and passport.is_integer():
                passport = int(passport)

            # Convert to string to ensure we can check the length
            passport_str = str(passport).strip()
            length = len(passport_str)

            # Check if the length is not 9
            if length != 9:
                invalid_passports.append((excel_row, name, passport_str, length))

        return invalid_passports

    except ImportError:
        # Pandas (or its Excel engine) is not available, use openpyxl
        return check_passport_lengths_openpyxl(file_path, passport_column, name_column)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    except Exception as e:
        print(f"Error reading the file with pandas: {str(e)}")
        # Fall back to openpyxl if pandas fails
        return check_passport_lengths_openpyxl(file_path, passport_column, name_column)
|
||||||
|
|
||||||
|
def check_passport_lengths_openpyxl(file_path, passport_column_header='passport', name_column_header='name'):
    """
    Check passport lengths in an Excel file using openpyxl.

    The first row of the active worksheet is treated as the header row. When
    a requested header is not present, a fixed list of common alternative
    header names is tried. If a header text occurs more than once, the first
    (left-most) column is used. A data row is reported when its passport cell
    is empty, or when the stripped string form of the value is not exactly
    9 characters long. Rows in which every cell is empty are skipped.

    Args:
        file_path (str): Path to the Excel file
        passport_column_header (str): Name of the column header containing passport numbers (default: 'passport')
        name_column_header (str): Name of the column header containing names (default: 'name')

    Returns:
        list: List of tuples containing (row_index, name, passport_number, length) for invalid passports.
            ``row_index`` is the worksheet row number (the header is row 1).
            Empty passport cells are reported as ``'None'`` with length 0.
            An empty list is also returned when the file cannot be read or
            no passport column can be found.
    """
    try:
        # Load the workbook and select the active worksheet
        wb = openpyxl.load_workbook(file_path)
        ws = wb.active

        # Scan the header row once and remember the 1-based column of each
        # header text; setdefault keeps the first column for duplicates.
        header_values = [cell.value for cell in ws[1]]
        column_of = {}
        for col_idx, header in enumerate(header_values, 1):
            if header is not None:
                column_of.setdefault(header, col_idx)

        passport_col_idx = column_of.get(passport_column_header)
        name_col_idx = column_of.get(name_column_header)

        # If the requested passport column doesn't exist, try common variations
        if passport_col_idx is None:
            passport_variations = ['passport', 'passport_number', 'passport_num', 'passports']
            for variation in passport_variations:
                if variation in column_of:
                    passport_col_idx = column_of[variation]
                    print(f"Info: Using column '{variation}' for passport data")
                    break

        # If the requested name column doesn't exist, try common variations
        if name_col_idx is None:
            name_variations = ['name', 'full_name', 'first_name', 'last_name', 'customer_name']
            for variation in name_variations:
                if variation in column_of:
                    name_col_idx = column_of[variation]
                    print(f"Info: Using column '{variation}' for name data")
                    break

        # Headers shown to the user in diagnostics (empty header cells omitted)
        visible_headers = [value for value in header_values if value is not None]

        if passport_col_idx is None:
            print(f"Error: No passport column found. Available columns: {visible_headers}")
            return []

        if name_col_idx is None:
            # If name column is not found, warn but continue to process
            print(f"Warning: No name column found. Available columns: {visible_headers}")
            # Use the first column as name to maintain backward compatibility
            name_col_idx = 1

        required_width = max(passport_col_idx, name_col_idx)
        invalid_passports = []

        # Start from row 2 to skip header
        for row_idx, row in enumerate(ws.iter_rows(min_row=2, values_only=True), start=2):
            # Ignore rows too short to hold both columns
            if len(row) < required_width:
                continue

            # A worksheet's used range can extend past the real data (e.g.
            # formatted but empty rows); a row with no values at all is not a
            # record and must not be reported as a missing passport.
            if all(value is None for value in row):
                continue

            passport = row[passport_col_idx - 1]  # Adjust for 0-based indexing
            name = row[name_col_idx - 1]

            # Handle None values for passport
            if passport is None:
                invalid_passports.append((row_idx, name, 'None', 0))
                continue

            # A numeric cell may come back as a float such as 123456789.0;
            # drop the artificial ".0" so it is measured as 9 characters.
            if isinstance(passport, float) and passport.is_integer():
                passport = int(passport)

            # Convert to string to ensure we can check the length
            passport_str = str(passport).strip()
            length = len(passport_str)

            # Check if the length is not 9
            if length != 9:
                invalid_passports.append((row_idx, name, passport_str, length))

        return invalid_passports

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    except Exception as e:
        print(f"Error reading the file with openpyxl: {str(e)}")
        return []
|
||||||
|
|
||||||
|
def check_passport_lengths(file_path, passport_column='passport', name_column='name'):
    """
    Check passport lengths in an Excel file, preferring pandas over openpyxl.

    pandas is imported as an availability probe. When the import succeeds the
    pandas-based checker is used; when it fails, or the pandas-based checker
    raises, the openpyxl-based checker is used instead.

    Args:
        file_path (str): Path to the Excel file
        passport_column (str): Name of the column containing passport numbers (default: 'passport')
        name_column (str): Name of the column containing names (default: 'name')

    Returns:
        list: List of tuples containing (row_index, name, passport_number, length) for invalid passports
    """
    def via_openpyxl():
        # Shared fallback used by both the ImportError and generic-error paths.
        return check_passport_lengths_openpyxl(file_path, passport_column, name_column)

    try:
        # Only imported to find out whether pandas is usable at all.
        import pandas as pd
        findings = check_passport_lengths_pandas(file_path, passport_column, name_column)
    except ImportError:
        # Pandas is not available or has compatibility issues, use openpyxl
        return via_openpyxl()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    except Exception as e:
        print(f"Error reading the file with pandas: {str(e)}")
        # Fall back to openpyxl if pandas fails
        return via_openpyxl()
    else:
        return findings
|
||||||
|
|
||||||
|
def main():
    """
    Command-line entry point: report passports whose length is not 9.

    The workbook path is taken from the first command-line argument; when no
    argument is given, 'real_name_contacts_100_aol_17_04.xlsx' is used. The
    result is printed to standard output, one line per invalid passport,
    followed by a summary line.

    Returns:
        None
    """
    import os
    import sys

    file_path = sys.argv[1] if len(sys.argv) > 1 else 'real_name_contacts_100_aol_17_04.xlsx'

    print(f"Checking passport lengths in file: {file_path}")
    print("-" * 50)

    # The checker reports a missing file by printing an error and returning an
    # empty list, which is indistinguishable from "nothing invalid" below.
    # Stop here so a missing file is never followed by a success message.
    if not os.path.isfile(file_path):
        print(f"Error: File '{file_path}' not found.")
        return

    # Check passport lengths
    invalid_passports = check_passport_lengths(file_path)

    if not invalid_passports:
        print("All passports have valid length (9 characters).")
        return

    print(f"Found {len(invalid_passports)} invalid passport(s):")
    print("")

    for row_idx, name, passport, length in invalid_passports:
        print(f"Row {row_idx}: Name '{name}', Passport '{passport}' has length {length} (should be 9)")

    print("")
    print(f"Summary: {len(invalid_passports)} invalid passport(s) found out of total entries.")
|
||||||
|
|
||||||
|
# Run the command-line report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user