python读取excel数据的详细教学-华金科技

Python 读取Excel数据详细教学

安装必要的库

# 安装 pandas（最常用）
pip install pandas openpyxl xlrd

# 安装 openpyxl（用于.xlsx文件）
pip install openpyxl

# 安装 xlrd（用于.xls文件，新版只支持读取）
pip install xlrd

# 可选：安装 xlsxwriter（用于写入）
pip install xlsxwriter

基本读取操作

1. 使用 pandas（推荐）

import pandas as pd

# 基本读取
df = pd.read_excel('data.xlsx')
print(df.head())  # 查看前5行

# 指定工作表
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# 或使用索引
df = pd.read_excel('data.xlsx', sheet_name=0)

# 指定读取范围
df = pd.read_excel('data.xlsx', 
                   usecols='A:C',  # 只读取A-C列
                   nrows=100)       # 只读取前100行

# 读取指定列
df = pd.read_excel('data.xlsx', 
                   usecols=['姓名', '年龄', '部门'])

# 处理表头
df = pd.read_excel('data.xlsx', 
                   header=0,        # 第0行作为表头
                   names=['col1', 'col2', 'col3'])  # 自定义列名

# 跳过行
df = pd.read_excel('data.xlsx', 
                   skiprows=2,     # 跳过前2行
                   skipfooter=3)   # 跳过最后3行

2. 使用 openpyxl

from openpyxl import load_workbook

# 加载工作簿
wb = load_workbook(filename='data.xlsx', 
                   read_only=True,  # 只读模式，处理大文件
                   data_only=True)  # 只读取值，不读取公式

# 选择工作表
ws = wb['Sheet1']  # 按名称
# 或
ws = wb.active     # 活动工作表

# 读取单个单元格
cell_value = ws['A1'].value
print(cell_value)

# 读取多个单元格
for row in ws.iter_rows(min_row=2, max_col=3, max_row=10, values_only=True):
    print(row)

# 读取所有数据
data = []
for row in ws.iter_rows(values_only=True):
    data.append(row)

3. 使用 xlrd（适合.xls文件）

import xlrd

# 打开工作簿
workbook = xlrd.open_workbook('data.xls')

# 获取工作表
sheet = workbook.sheet_by_index(0)  # 按索引
# 或
sheet = workbook.sheet_by_name('Sheet1')  # 按名称

# 读取数据
print(f"工作表名称: {sheet.name}")
print(f"行数: {sheet.nrows}")
print(f"列数: {sheet.ncols}")

# 读取单元格
cell_value = sheet.cell_value(0, 0)  # 第0行第0列
print(cell_value)

# 读取整行
row_values = sheet.row_values(0)  # 第0行
print(row_values)

# 读取整列
col_values = sheet.col_values(0)  # 第0列
print(col_values)

常用库对比

特性	pandas	openpyxl	xlrd
文件格式	.xlsx, .xls	.xlsx	.xls
读取速度	快	中等	快
写入支持	是	是	否
公式支持	读取结果	可读取公式	读取结果
大文件支持	好	好（只读模式）	好
功能丰富度	★★★★★	★★★★☆	★★★☆☆

详细示例

示例1：完整的数据处理流程

import pandas as pd
import numpy as np

def process_excel_data(file_path):
    """
    完整的Excel数据处理示例
    """
    try:
        # 1. 读取数据
        df = pd.read_excel(file_path)

        # 2. 查看基本信息
        print("=== 数据基本信息 ===")
        print(f"形状: {df.shape}")
        print(f"列名: {list(df.columns)}")
        print(f"数据类型:\n{df.dtypes}")

        # 3. 查看前几行数据
        print("\n=== 前5行数据 ===")
        print(df.head())

        # 4. 统计信息
        print("\n=== 统计信息 ===")
        print(df.describe())

        # 5. 检查缺失值
        print("\n=== 缺失值统计 ===")
        print(df.isnull().sum())

        # 6. 处理缺失值
        df_cleaned = df.fillna({
            '数值列': df['数值列'].mean(),
            '文本列': '未知'
        })

        # 7. 数据转换
        if '日期列' in df.columns:
            df_cleaned['日期列'] = pd.to_datetime(df_cleaned['日期列'])

        # 8. 筛选数据
        filtered_data = df_cleaned[df_cleaned['年龄'] > 18]

        # 9. 分组统计
        if '部门' in df.columns and '工资' in df.columns:
            group_stats = df_cleaned.groupby('部门')['工资'].agg(['mean', 'sum', 'count'])
            print("\n=== 部门工资统计 ===")
            print(group_stats)

        return df_cleaned

    except Exception as e:
        print(f"读取Excel文件时出错: {e}")
        return None

# 使用示例
df = process_excel_data('员工数据.xlsx')

示例2：读取多个工作表

import pandas as pd

def read_multiple_sheets(file_path):
    """
    读取Excel文件中的所有工作表
    """
    # 方法1：读取所有工作表到字典
    all_sheets = pd.read_excel(file_path, sheet_name=None)

    print(f"工作表数量: {len(all_sheets)}")

    for sheet_name, df in all_sheets.items():
        print(f"\n工作表: {sheet_name}")
        print(f"数据形状: {df.shape}")
        print(df.head(3))

    # 方法2：读取特定工作表
    sheet_list = ['员工信息', '部门信息']
    selected_sheets = {}

    for sheet in sheet_list:
        try:
            df = pd.read_excel(file_path, sheet_name=sheet)
            selected_sheets[sheet] = df
        except:
            print(f"工作表 {sheet} 不存在")

    return all_sheets, selected_sheets

# 使用示例
all_data, selected_data = read_multiple_sheets('公司数据.xlsx')

高级操作

1. 处理大数据文件

import pandas as pd

def read_large_excel(file_path, chunk_size=1000):
    """
    分块读取大Excel文件
    """
    chunks = []

    # 分块读取
    for chunk in pd.read_excel(file_path, 
                               chunksize=chunk_size,
                               engine='openpyxl'):
        # 对每个块进行处理
        processed_chunk = chunk.dropna()  # 示例：删除缺失值
        chunks.append(processed_chunk)

    # 合并所有块
    df = pd.concat(chunks, ignore_index=True)

    return df

# 或使用openpyxl的只读模式
def read_large_excel_openpyxl(file_path):
    from openpyxl import load_workbook

    wb = load_workbook(filename=file_path, read_only=True)
    ws = wb.active

    data = []
    for row in ws.iter_rows(values_only=True):
        data.append(row)

    df = pd.DataFrame(data[1:], columns=data[0])
    return df

2. 读取带格式的数据

from openpyxl import load_workbook

def read_with_formatting(file_path):
    """
    读取单元格格式信息
    """
    wb = load_workbook(filename=file_path)
    ws = wb.active

    cell_data = []

    for row in ws.iter_rows(min_row=1, max_col=5, max_row=10):
        row_data = []
        for cell in row:
            cell_info = {
                'value': cell.value,
                'font_color': cell.font.color.rgb if cell.font.color else None,
                'fill_color': cell.fill.fgColor.rgb if cell.fill else None,
                'number_format': cell.number_format,
                'alignment': cell.alignment.horizontal
            }
            row_data.append(cell_info)
        cell_data.append(row_data)

    return cell_data

3. 处理合并单元格

import pandas as pd
from openpyxl import load_workbook

def handle_merged_cells(file_path):
    """
    处理合并单元格的读取
    """
    # 使用openpyxl读取合并单元格信息
    wb = load_workbook(filename=file_path)
    ws = wb.active

    # 获取所有合并单元格范围
    merged_ranges = ws.merged_cells.ranges

    # 创建数据填充字典
    fill_dict = {}

    for merged_range in merged_ranges:
        min_row, min_col, max_row, max_col = merged_range.min_row, merged_range.min_col, merged_range.max_row, merged_range.max_col
        value = ws.cell(row=min_row, column=min_col).value

        # 为合并区域的所有单元格填充相同值
        for row in range(min_row, max_row + 1):
            for col in range(min_col, max_col + 1):
                fill_dict[(row, col)] = value

    # 读取数据并填充合并单元格
    data = []
    for row in ws.iter_rows(values_only=True):
        data.append(list(row))

    df = pd.DataFrame(data)

    # 如果需要，可以使用fill_dict进一步处理
    return df, fill_dict

4. 读取特定区域

import pandas as pd

def read_specific_range(file_path):
    """
    读取Excel中的特定区域
    """
    # 方法1：使用usecols和skiprows
    df_range1 = pd.read_excel(
        file_path,
        usecols='B:D,G:I',  # 读取B-D列和G-I列
        skiprows=3,         # 跳过前3行
        nrows=50           # 读取50行
    )

    # 方法2：使用openpyxl读取后再转换
    from openpyxl import load_workbook

    wb = load_workbook(filename=file_path, data_only=True)
    ws = wb.active

    # 读取A1:C10区域
    range_data = []
    for row in ws.iter_rows(min_row=1, min_col=1, 
                           max_row=10, max_col=3,
                           values_only=True):
        range_data.append(row)

    df_range2 = pd.DataFrame(range_data)

    return df_range1, df_range2

常见问题

问题1：编码问题

# 处理编码问题
try:
    df = pd.read_excel('data.xlsx', encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_excel('data.xlsx', encoding='gbk')

问题2：日期格式处理

# 指定日期列并格式化
df = pd.read_excel('data.xlsx', 
                   parse_dates=['出生日期', '入职日期'],
                   date_parser=lambda x: pd.to_datetime(x, format='%Y-%m-%d'))

问题3：性能优化

# 优化读取性能
df = pd.read_excel('large_file.xlsx',
                   engine='openpyxl',
                   dtype={'ID': 'int32', 'Score': 'float32'},
                   usecols=lambda x: 'Unnamed' not in x)  # 跳过未命名列

问题4：处理多个Excel文件

import os
import pandas as pd

def read_multiple_excel_files(folder_path):
    """
    读取文件夹中的所有Excel文件
    """
    all_data = {}

    for file_name in os.listdir(folder_path):
        if file_name.endswith(('.xlsx', '.xls')):
            file_path = os.path.join(folder_path, file_name)

            try:
                # 读取所有工作表
                excel_file = pd.ExcelFile(file_path)

                for sheet_name in excel_file.sheet_names:
                    df = excel_file.parse(sheet_name)
                    key = f"{file_name}_{sheet_name}"
                    all_data[key] = df
                    print(f"已读取: {key}, 形状: {df.shape}")

            except Exception as e:
                print(f"读取文件 {file_name} 时出错: {e}")

    return all_data

问题5：内存管理

import gc

def process_large_excel_memory_efficient(file_path):
    """
    内存高效处理大Excel文件
    """
    # 只读取需要的列
    df = pd.read_excel(file_path, 
                       usecols=['必要列1', '必要列2', '必要列3'])

    # 优化数据类型
    df['整数列'] = pd.to_numeric(df['整数列'], downcast='integer')
    df['浮点列'] = pd.to_numeric(df['浮点列'], downcast='float')
    df['文本列'] = df['文本列'].astype('category')

    # 及时释放内存
    del df
    gc.collect()

最佳实践建议

选择合适库：

新项目推荐使用pandas
需要格式信息时使用openpyxl
旧.xls文件使用xlrd

处理大文件：

使用分块读取(chunksize)
只读取需要的列(usecols)
使用合适的数据类型

错误处理：

始终使用try-except包装
检查文件是否存在
处理编码问题

性能优化：

避免在循环中读取文件
使用适当的数据类型
及时释放内存

完整示例代码

import pandas as pd
import numpy as np
from datetime import datetime

class ExcelReader:
    """Excel读取器类"""

    def __init__(self, file_path):
        self.file_path = file_path
        self.df = None

    def read_excel(self, **kwargs):
        """读取Excel文件"""
        try:
            self.df = pd.read_excel(self.file_path, **kwargs)
            print(f"成功读取文件: {self.file_path}")
            print(f"数据形状: {self.df.shape}")
            return True
        except Exception as e:
            print(f"读取失败: {e}")
            return False

    def get_basic_info(self):
        """获取基本信息"""
        if self.df is None:
            return None

        info = {
            'shape': self.df.shape,
            'columns': list(self.df.columns),
            'dtypes': self.df.dtypes.to_dict(),
            'memory_usage': self.df.memory_usage(deep=True).sum(),
            'null_counts': self.df.isnull().sum().to_dict()
        }
        return info

    def filter_data(self, conditions):
        """筛选数据"""
        if self.df is None:
            return None

        filtered_df = self.df.copy()
        for column, condition in conditions.items():
            if callable(condition):
                filtered_df = filtered_df[filtered_df[column].apply(condition)]
            else:
                filtered_df = filtered_df[filtered_df[column] == condition]

        return filtered_df

    def save_to_csv(self, output_path):
        """保存为CSV"""
        if self.df is not None:
            self.df.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f"已保存到: {output_path}")

# 使用示例
if __name__ == "__main__":
    # 创建读取器
    reader = ExcelReader('示例数据.xlsx')

    # 读取数据
    if reader.read_excel(sheet_name=0, header=0):
        # 获取基本信息
        info = reader.get_basic_info()
        print("基本信息:", info)

        # 筛选数据
        filtered = reader.filter_data({'部门': '技术部', '年龄': lambda x: x > 25})
        print(f"筛选后数据形状: {filtered.shape}")

        # 保存结果
        reader.save_to_csv('筛选结果.csv')

这个详细教学涵盖了Python读取Excel的各个方面，从基础到高级应用。根据你的具体需求选择合适的库和方法即可。