Python Tutorial - Lesson 9: Regular Expressions!

欢迎回来！🎉 这节课我们要学习正则表达式（Regular Expression）—— 一种强大的文本处理工具，让你可以轻松地查找、替换和验证文本！

复习一下前八节课

前八节课我们学会了：
- 变量和基本数据类型
- 条件判断和循环
- 列表和字典
- 函数（代码的魔法师）
- 文件操作（数据持久化）
- 模块和包（代码组织）
- 面向对象编程（OOP）
- 异常和错误处理

第一个知识点：什么是正则表达式？

正则表达式的作用

正则表达式是一种用于匹配字符串模式的工具，可以：
- ✅ 验证：检查字符串是否符合某种格式
- ✅ 查找：在文本中找到匹配的内容
- ✅ 替换：替换匹配的内容
- ✅ 提取：提取需要的信息

生活中的例子

第二个知识点：re模块基础

导入re模块

import re

常用函数

1. match()：从开头匹配

场景	描述
验证邮箱	检查用户输入的邮箱格式是否正确
验证手机号	检查手机号是否是11位数字
查找网址	在文本中找到所有的URL
提取日期	从文本中提取日期信息

# match：从字符串开头匹配
pattern = r"hello"
text = "hello world"

result = re.match(pattern, text)
if result:
    print("匹配成功！")
    print("匹配的内容:", result.group())
    print("起始位置:", result.start())
    print("结束位置:", result.end())
    print("位置范围:", result.span())
else:
    print("匹配失败")

2. search()：搜索整个字符串

# search：在整个字符串中搜索
pattern = r"world"
text = "hello world hello"

result = re.search(pattern, text)
if result:
    print("找到匹配！")
    print("匹配的内容:", result.group())

3. findall()：找到所有匹配

# findall：找到所有匹配，返回列表
pattern = r"\d+"  # 匹配数字
text = "苹果10元，香蕉5元，橙子8元"

results = re.findall(pattern, text)
print("找到的数字:", results)

4. finditer()：找到所有匹配（迭代器）

# finditer：找到所有匹配，返回迭代器
pattern = r"\d+"
text = "苹果10元，香蕉5元，橙子8元"

for match in re.finditer(pattern, text):
    print(f"找到: {match.group()}, 位置: {match.span()}")

5. sub()：替换

# sub：替换匹配的内容
pattern = r"\d+"
text = "苹果10元，香蕉5元，橙子8元"

# 把数字替换成"*"
new_text = re.sub(pattern, "*", text)
print("替换后:", new_text)

# 使用函数替换
def double_num(match):
    """Replacement callback for ``re.sub``: return the matched integer doubled.

    Args:
        match: Match object whose full matched text is an integer literal.

    Returns:
        The doubled value rendered as a decimal string.
    """
    doubled = 2 * int(match.group(0))
    return f"{doubled}"

new_text2 = re.sub(pattern, double_num, text)
print("翻倍后:", new_text2)

6. split()：分割

# split：按匹配分割字符串
pattern = r"[,，]"  # 匹配逗号或中文逗号
text = "苹果,香蕉，橙子,葡萄"

parts = re.split(pattern, text)
print("分割结果:", parts)

第三个知识点：基础语法

1. 普通字符

# 普通字符直接匹配
pattern = r"python"
text = "I love Python and python"

# 忽略大小写
result = re.findall(pattern, text, re.IGNORECASE)
print("匹配结果:", result)

2. 元字符

字符	描述	示例
`.`	匹配任意字符（除了换行）	`a.b` 匹配 "aab", "acb", "a1b"
`^`	匹配字符串开头	`^hello` 匹配以hello开头的
`$`	匹配字符串结尾（若字符串以换行符结尾，也会匹配该换行符之前的位置）	`world$` 匹配以world结尾的
`*`	匹配0次或多次	`a*` 匹配 "", "a", "aa", "aaa"
`+`	匹配1次或多次	`a+` 匹配 "a", "aa", "aaa"
`?`	匹配0次或1次	`a?` 匹配 "", "a"
`{n}`	匹配n次	`a{3}` 匹配 "aaa"
`{n,}`	匹配至少n次	`a{2,}` 匹配 "aa", "aaa"
`{n,m}`	匹配n到m次	`a{2,4}` 匹配 "aa", "aaa", "aaaa"
`\`	转义字符	`\.` 匹配点号本身

# 点号：匹配任意字符
print(re.findall(r"a.b", "aab acb a1b a b"))  # ['aab', 'acb', 'a1b', 'a b']

# ^和$：开头和结尾
print(re.findall(r"^hello", "hello world"))  # ['hello']
print(re.findall(r"world$", "hello world"))  # ['world']

# *：0次或多次
print(re.findall(r"a*", "aaabaaa"))  # ['aaa', '', 'aaa', '']

# +：1次或多次
print(re.findall(r"a+", "aaabaaa"))  # ['aaa', 'aaa']

# ?：0次或1次
print(re.findall(r"a?", "aaabaaa"))  # ['a', 'a', 'a', '', 'a', 'a', 'a', '']

# {n}：精确n次
print(re.findall(r"a{3}", "aaabaaa"))  # ['aaa', 'aaa']

# {n,}：至少n次
print(re.findall(r"a{2,}", "a aa aaa aaaa"))  # ['aa', 'aaa', 'aaaa']

# {n,m}：n到m次
print(re.findall(r"a{2,3}", "a aa aaa aaaa"))  # ['aa', 'aaa', 'aaa']

3. 字符类

字符	描述
`[abc]`	匹配a、b或c
`[^abc]`	匹配除了a、b、c以外的字符
`[a-z]`	匹配小写字母
`[A-Z]`	匹配大写字母
`[0-9]`	匹配数字
`[a-zA-Z0-9]`	匹配字母和数字

# 字符类
print(re.findall(r"[aeiou]", "hello world"))  # ['e', 'o', 'o']
print(re.findall(r"[^aeiou]", "hello"))       # ['h', 'l', 'l']
print(re.findall(r"[a-z]", "Hello123"))       # ['e', 'l', 'l', 'o']
print(re.findall(r"[0-9]", "Hello123"))       # ['1', '2', '3']
print(re.findall(r"[a-zA-Z0-9]", "Hello!@#123"))  # ['H','e','l','l','o','1','2','3']

4. 预定义字符类

字符	描述
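`\d`	数字；在ASCII模式（`re.ASCII`）下等价于 `[0-9]`，默认还会匹配全角数字等其他Unicode数字
`\D`	非数字，即 `\d` 的补集（ASCII模式下等价于 `[^0-9]`）
`\w`	单词字符；在ASCII模式（`re.ASCII`）下等价于 `[a-zA-Z0-9_]`，默认还会匹配中文等Unicode字母和数字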
`\W`	非单词字符
`\s`	空白字符（空格、制表符、换行等）
`\S`	非空白字符

# 预定义字符类
text = "Hello 123 World!\tPython\n"

print(re.findall(r"\d", text))  # ['1', '2', '3']
print(re.findall(r"\D", text))  # 非数字
print(re.findall(r"\w", text))  # 单词字符
print(re.findall(r"\W", text))  # 非单词字符
print(re.findall(r"\s", text))  # 空白字符
print(re.findall(r"\S", text))  # 非空白字符

5. 边界匹配

字符	描述
`\b`	单词边界
`\B`	非单词边界

# 单词边界
text = "hello world helloworld"

print(re.findall(r"\bhello\b", text))  # ['hello']（只匹配独立的hello）
print(re.findall(r"hello\B", text))    # ['hello']（匹配helloworld中的hello）

第四个知识点：分组

1. 基本分组

# 使用()分组
text = "我的电话是13812345678，他的电话是13987654321"
pattern = r"(\d{3})(\d{4})(\d{4})"  # 分成三组

# findall会返回分组的元组
results = re.findall(pattern, text)
print(results)  # [('138', '1234', '5678'), ('139', '8765', '4321')]

# 使用group访问分组
match = re.search(pattern, text)
if match:
    print("完整匹配:", match.group(0))
    print("第一组:", match.group(1))
    print("第二组:", match.group(2))
    print("第三组:", match.group(3))
    print("所有组:", match.groups())

2. 命名分组

# 命名分组：(?P<name>...)
text = "日期：2024-03-15"
pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"

match = re.search(pattern, text)
if match:
    print("年份:", match.group('year'))
    print("月份:", match.group('month'))
    print("日期:", match.group('day'))
    print("分组字典:", match.groupdict())

3. 非捕获分组

# 非捕获分组：(?:...)
text = "aab abb"
pattern = r"a(?:a|b)b"  # 分组但不捕获

results = re.findall(pattern, text)
print(results)  # ['aab', 'abb']（返回完整匹配，不是分组）

第五个知识点：常用实战案例

1. 验证邮箱

def is_valid_email(email):
    """Return True if ``email`` looks like a well-formed e-mail address.

    This is a pragmatic format check, not a full RFC 5322 validator.

    Args:
        email: The candidate address as a string.

    Returns:
        True when the entire string is ``local@label(.label)*.tld``,
        otherwise False.
    """
    # Each domain label must be non-empty, so "user@a..com" and
    # "user@.com" are rejected; the TLD is at least two ASCII letters.
    pattern = r'[a-zA-Z0-9._%+-]+@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}'
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "a@b.com\n" through.
    return re.fullmatch(pattern, email) is not None

# 测试
emails = [
    "test@example.com",
    "user.name+tag@domain.co.uk",
    "invalid-email",
    "@example.com",
    "user@.com"
]

for email in emails:
    print(f"{email}: {'✓ 有效' if is_valid_email(email) else '✗ 无效'}")

2. 验证手机号

def is_valid_phone(phone):
    """Return True if ``phone`` is a mainland-China mobile number.

    Args:
        phone: The candidate number as a string.

    Returns:
        True when the entire string is exactly 11 ASCII digits, starting
        with ``1`` followed by a digit in ``3``-``9``; otherwise False.
    """
    # [0-9] rather than \d: in the default Unicode mode \d also accepts
    # non-ASCII digits (for example full-width ones).
    pattern = r'1[3-9][0-9]{9}'
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would accept "13812345678\n".
    return re.fullmatch(pattern, phone) is not None

# 测试
phones = [
    "13812345678",
    "19987654321",
    "12345678901",  # 12开头，无效
    "1381234567",   # 位数不够
    "138123456789"  # 位数太多
]

for phone in phones:
    print(f"{phone}: {'✓ 有效' if is_valid_phone(phone) else '✗ 无效'}")

3. 提取URL

def extract_urls(text):
    """Extract all http/https URLs from ``text``.

    Args:
        text: The text to scan.

    Returns:
        A list of URL strings in order of appearance (empty if none).
    """
    # Only the ASCII characters that RFC 3986 allows in a URI are consumed.
    # The previous "[^\s]+" also swallowed adjacent CJK punctuation, so
    # "https://a.com，http://b.org。" came back as a single bogus URL.
    # Trade-off: non-ASCII (IRI) characters end the match.
    pattern = r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+"
    return re.findall(pattern, text)

# 测试
text = """
访问我的网站：https://www.example.com
或者查看文档：http://docs.example.org/page.html
也可以访问：https://sub.domain.example.com/path?query=123
"""

urls = extract_urls(text)
print("找到的URL：")
for url in urls:
    print(f"  - {url}")

4. 提取日期

def extract_dates(text):
    """Extract ``YYYY-MM-DD`` dates from ``text``.

    Only the shape is checked; month and day ranges are not validated.

    Args:
        text: The text to scan.

    Returns:
        A list of dicts, one per date in order of appearance, with the
        string keys ``'full'``, ``'year'``, ``'month'`` and ``'day'``.
    """
    # The lookarounds stop the pattern from matching inside a longer digit
    # run (e.g. "12024-01-155" must not yield "2024-01-15"). They are used
    # instead of \b because \b does not fire between a CJK character and
    # a digit in the default Unicode mode.
    pattern = r'(?<![0-9])([0-9]{4})-([0-9]{2})-([0-9]{2})(?![0-9])'
    return [
        {
            'full': found.group(0),
            'year': found.group(1),
            'month': found.group(2),
            'day': found.group(3),
        }
        for found in re.finditer(pattern, text)
    ]

# 测试
text = "项目从2024-01-15开始，到2024-03-31结束，中间2024-02-14有个里程碑"

dates = extract_dates(text)
print("找到的日期：")
for date in dates:
    print(f"  - {date['full']} (年:{date['year']}, 月:{date['month']}, 日:{date['day']})")

5. 替换敏感信息

def mask_sensitive_info(text):
    """Mask phone numbers and e-mail user names found in ``text``.

    Args:
        text: The text to sanitize.

    Returns:
        A copy of ``text`` in which every run of 11 consecutive digits keeps
        its first 3 and last 4 digits (the middle 4 become ``****``), and
        every e-mail local part of three or more characters keeps only its
        first and last character (the rest becomes ``***``).
    """
    # Phone: keep the first 3 and last 4 digits, hide the middle 4.
    text = re.sub(r'(\d{3})\d{4}(\d{4})', r'\1****\2', text)

    # E-mail: an explicit ASCII class is used for the local part. With \w
    # the match also consumed CJK characters in front of the address
    # ("邮箱是testuser@" became "邮***r@"), destroying surrounding text.
    local = r'[A-Za-z0-9._%+-]'
    text = re.sub(rf'({local}){local}+({local}@)', r'\1***\2', text)

    return text

# 测试
text = "我的手机号是13812345678，邮箱是testuser@example.com"
masked = mask_sensitive_info(text)
print("原始:", text)
print("脱敏:", masked)

第六个知识点：编译正则表达式

re.compile()

# 编译正则表达式，可以重复使用
pattern = re.compile(r'\d+')

text1 = "123 abc 456"
text2 = "789 def 012"

# 使用编译后的pattern
print(pattern.findall(text1))  # ['123', '456']
print(pattern.findall(text2))  # ['789', '012']

# 指定编译标志
pattern = re.compile(r'hello', re.IGNORECASE)  # 忽略大小写
print(pattern.findall("Hello HELLO hello"))  # ['Hello', 'HELLO', 'hello']

常用标志

标志	描述
`re.IGNORECASE` 或 `re.I`	忽略大小写
`re.MULTILINE` 或 `re.M`	多行模式，^和$匹配每一行
`re.DOTALL` 或 `re.S`	点号匹配包括换行符
`re.VERBOSE` 或 `re.X`	详细模式，可以写注释

# 多行模式
text = """first line
second line
third line"""

pattern = re.compile(r'^(\w+)', re.MULTILINE)
print(pattern.findall(text))  # ['first', 'second', 'third']

# 点号匹配换行符
text = "line1\nline2"
pattern = re.compile(r'line1.line2', re.DOTALL)
print(pattern.search(text) is not None)  # True

# 详细模式
pattern = re.compile(r"""
    \d{4}    # 年份
    -        # 分隔符
    \d{2}    # 月份
    -        # 分隔符
    \d{2}    # 日期
""", re.VERBOSE)

print(pattern.match("2024-03-15") is not None)  # True

综合练习：文本分析工具

import re
from collections import Counter

class TextAnalyzer:
    """Regex-based statistics and entity extraction for one piece of text.

    The text is stored as ``self.text``; every method re-scans it on demand.

    Note on words: a "word" is a maximal run of ``\\w`` characters, so a run
    of Chinese characters without spaces or punctuation counts as a single
    word (no word segmentation is performed).
    """

    # Patterns are compiled once at class-definition time and reused.
    _WORD_RE = re.compile(r'\b\w+\b')
    # One or more sentence terminators. An ASCII dot only counts when it is
    # not followed by an ASCII letter/digit, so the dots in "example.com" or
    # "3.14" do not split a sentence. CJK terminators are included.
    _SENTENCE_END_RE = re.compile(r'(?:[!?。！？]|\.(?![A-Za-z0-9]))+')
    # re.ASCII makes \b and \d ASCII-only. In the default Unicode mode CJK
    # characters are "word" characters, so \b does not fire between e.g.
    # "是" and "test@example.com" and the item would be silently missed.
    _EMAIL_RE = re.compile(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', re.ASCII
    )
    _PHONE_RE = re.compile(r'\b1[3-9]\d{9}\b', re.ASCII)
    _DATE_RE = re.compile(r'\b(\d{4})-(\d{2})-(\d{2})\b', re.ASCII)
    # Only RFC 3986 ASCII URI characters, so adjacent CJK punctuation such
    # as "，" or "。" ends the URL instead of being swallowed into it.
    _URL_RE = re.compile(r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+")
    _QUOTE_RE = re.compile(r'"([^"]+)"|\'([^\']+)\'')

    def __init__(self, text):
        """Store the text to analyze.

        Args:
            text: The string that all other methods operate on.
        """
        self.text = text

    def _words(self):
        """Return all lower-cased words of the text, in order."""
        return self._WORD_RE.findall(self.text.lower())

    def count_words(self):
        """Return the number of words (maximal ``\\w`` runs) in the text."""
        return len(self._words())

    def count_sentences(self):
        """Return the number of non-blank sentences.

        Sentences end at ``!``, ``?``, ``。``, ``！``, ``？`` or at an ASCII
        ``.`` that is not immediately followed by an ASCII letter or digit.
        """
        pieces = self._SENTENCE_END_RE.split(self.text)
        return sum(1 for piece in pieces if piece.strip())

    def count_characters(self, include_spaces=True):
        """Return the number of characters in the text.

        Args:
            include_spaces: When False, all whitespace characters (``\\s``)
                are excluded from the count.
        """
        if include_spaces:
            return len(self.text)
        return len(re.sub(r'\s', '', self.text))

    def find_emails(self):
        """Return all e-mail addresses found in the text, in order."""
        return self._EMAIL_RE.findall(self.text)

    def find_phones(self):
        """Return all mainland-China mobile numbers (11 digits), in order."""
        return self._PHONE_RE.findall(self.text)

    def find_urls(self):
        """Return all http/https URLs found in the text, in order."""
        return self._URL_RE.findall(self.text)

    def find_dates(self):
        """Return all ``YYYY-MM-DD`` dates found in the text.

        Returns:
            A list of dicts with the string keys ``'full'``, ``'year'``,
            ``'month'`` and ``'day'``. Month/day ranges are not validated.
        """
        return [
            {
                'full': found.group(),
                'year': found.group(1),
                'month': found.group(2),
                'day': found.group(3),
            }
            for found in self._DATE_RE.finditer(self.text)
        ]

    def word_frequency(self, top_n=10):
        """Return the ``top_n`` most common lower-cased words.

        Returns:
            A list of ``(word, count)`` tuples, most frequent first.
        """
        return Counter(self._words()).most_common(top_n)

    def extract_quotations(self):
        """Return the contents of ASCII double- or single-quoted spans."""
        quotations = []
        for found in self._QUOTE_RE.finditer(self.text):
            # Exactly one alternative matched; the other group is None.
            quotation = found.group(1) or found.group(2)
            if quotation:
                quotations.append(quotation)
        return quotations

    @staticmethod
    def _print_section(title, items, render=str):
        """Print one titled bullet list of the report; skip it when empty."""
        if not items:
            return
        print(f"\n{title}（{len(items)}个）：")
        for item in items:
            print(f"  - {render(item)}")

    def generate_report(self):
        """Print a full analysis report of the text to standard output."""
        rule = "=" * 60
        print(rule)
        print("  📊 文本分析报告")
        print(rule)
        print("\n📝 基本统计：")
        print(f"  字符数（含空格）：{self.count_characters(True)}")
        print(f"  字符数（不含空格）：{self.count_characters(False)}")
        print(f"  单词数：{self.count_words()}")
        print(f"  句子数：{self.count_sentences()}")

        self._print_section("📧 找到的邮箱", self.find_emails())
        self._print_section("📱 找到的手机号", self.find_phones())
        self._print_section("🌐 找到的URL", self.find_urls())
        self._print_section(
            "📅 找到的日期", self.find_dates(), render=lambda d: d['full']
        )
        self._print_section(
            "💬 找到的引用", self.extract_quotations(),
            render=lambda q: f'"{q}"',
        )

        word_freq = self.word_frequency(10)
        if word_freq:
            print("\n🔝 词频前10：")
            for i, (word, count) in enumerate(word_freq, 1):
                print(f"  {i}. {word}: {count}次")

        print("\n" + rule)

# 测试文本
sample_text = """
这是一个测试文本。Hello World!
我的邮箱是test@example.com，另一个邮箱是admin@company.org。
手机号：13812345678，19987654321。
访问网站：https://www.example.com，http://test.org。
重要日期：2024-01-15，2024-03-15，2024-06-15。
引用内容："这是第一条引用"，'这是第二条引用'。
词频测试：hello hello world world world test test test test。
"""

# 使用文本分析器
analyzer = TextAnalyzer(sample_text)
analyzer.generate_report()

第九课小结

✅ 正则表达式的概念：强大的文本模式匹配工具
✅ re模块：match、search、findall、finditer、sub、split
✅ 基础语法：普通字符、元字符、字符类、预定义字符类
✅ 分组：基本分组、命名分组、非捕获分组
✅ 常用标志：IGNORECASE、MULTILINE、DOTALL、VERBOSE
✅ 实战案例：验证邮箱、手机号、提取URL、日期等
✅ 编译正则表达式：提高性能，重复使用

Python学习教程 - 第九课：正则表达式，文本处理的瑞士军刀！