regex 如何修改Python代码,从混合输入字符串中提取和格式化列表项,包括将每个列表项置于上下文中?

a14dhokn  于 12个月前  发布在  Python
关注(0)|答案(1)|浏览(91)

代码:

import textwrap

def filter_lists(text):
    """Return only the lines of ``text`` that look like list items.

    Every line is stripped of surrounding whitespace and tested against a
    collection of list-marker patterns (numbers, letters, roman numerals,
    bullets, symbols, bracketed tags, ...).  Lines that match at least one
    pattern are kept, unstripped and in their original order.

    Args:
        text: Input string whose lines are separated by newline characters.

    Returns:
        The kept lines joined with newline characters; an empty string when
        nothing matches.
    """
    patterns = [
        r'^[0-9]+\.',  # number list item
        r'^[a-zA-Z]\.',  # letter list item
        r'^\u2022',  # bullet point list item
        # Roman numeral list item.  Upper-case numerals are accepted as well,
        # otherwise headers such as "III. Germany Cities:" are dropped.
        r'^[ivxIVX]+\.',
        r'^\u25E6',  # special bullet point list item
        r'^\u2713',  # Checkmark List Item
        r'^[→←↑↓]',  # Arrow List Item (Add arrows as needed)
        r'^♦',  # Diamond List Item
        r'^★',  # Star List Item
        r'^[^\w\s]',  # Emoji List Item (Matches any non-word, non-space character)
        r'^\uE000',  # Icon List Item (Replace with the specific Unicode code for your icon)
        r'^[@#*%!&]',  # Custom Symbol List Item (Add your custom symbols within the brackets)
        r'^(red|blue|green|yellow)',  # Color-Coded List Item (Add color names or codes)
        r'^\d+\.(jpg|png|gif)',  # Image List Item (Matches numbered image file names)
        r'^\[\d{1,3}%\]',  # Progress Bar List Item (Matches percentages in square brackets)
        r'^\[[A-Za-z]+\]',  # Tag or Label List Item (Matches words in square brackets)
        r'^\d+⚫',  # Numbered Icon List Item (Matches numbers followed by a black circle)
        r'^"([^"]+)"',  # Quote List Item (Matches text enclosed in double quotes)
        r'^\d{8}',  # Barcode List Item (Matches 8-digit numbers, adjust as needed)
    ]
    # Compile once per call instead of handing raw strings to re.match for
    # every (line, pattern) pair.
    compiled = [re.compile(pattern) for pattern in patterns]

    filtered_lines = []
    for line in text.split('\n'):
        candidate = line.strip()
        # The stripped text is only used for matching; the original line
        # (including its indentation) is what gets kept.
        if any(regex.match(candidate) for regex in compiled):
            filtered_lines.append(line)

    return '\n'.join(filtered_lines)
import hashlib
import re

def process_string(input_string, prefix_format=None, hierarchy=None):
    """Re-format the lines of ``input_string`` as prefixed list items.

    Each non-empty line is split into a prefix, a separator and the remaining
    content, the prefix is passed through ``normalize_prefix`` and the result
    is rendered with ``format_output``.

    Args:
        input_string: Text to process.
        prefix_format: A prefix-format dict or a list of them.  When ``None``
            it is obtained from ``infer_prefix_format`` using the first line
            of ``input_string``.  A non-list value is wrapped in a list.
        hierarchy: A preference list or a list of them.  When ``None`` it is
            obtained from ``infer_hierarchy``.  A non-list value is wrapped
            in a list.

    Returns:
        The concatenated output of ``format_output`` for every processed
        line (plus a newline for every empty line); an empty string when the
        input is empty or whitespace only.

    Raises:
        ValueError: When a deeper level is entered and ``prefix_format`` or
            ``hierarchy`` has no entry for it, or when the validators reject
            that entry.  ``ValueError`` raised by ``normalize_prefix`` is
            caught, printed, and the line is skipped.
    """
    output_string = ''
    used_prefixes = {}
    stack = []
    level = 0
    if prefix_format is None:
        first_line = input_string.split('\n')[0]
        prefix_format = infer_prefix_format(first_line)
    if not isinstance(prefix_format, list):
        prefix_format = [prefix_format]
    if hierarchy is None:
        hierarchy = infer_hierarchy(input_string)
    if not isinstance(hierarchy, list):
        hierarchy = [hierarchy]
    input_string = textwrap.dedent(input_string)
    # NOTE(review): textwrap.wrap re-flows the text to 80 columns and, with
    # its default settings, replaces whitespace (newlines included) by spaces,
    # so the original one-item-per-line structure is not kept -- confirm that
    # this is intended.
    input_string = '\n'.join(textwrap.wrap(input_string, width=80))
    lines = input_string.split('\n')
    if not input_string or input_string.isspace():
        return output_string
    for line in lines:
        line = line.strip()
        if not line:
            # Empty line: emit a newline and trim the bookkeeping to the
            # current level.
            output_string += '\n'
            used_prefixes = {k: v for k, v in used_prefixes.items() if k in stack[:level]}
            stack = stack[:level]
            continue
        # Indentation depth in units of four whitespace characters.
        # NOTE(review): ``line`` was stripped above, so the leading-whitespace
        # pattern cannot match here and current_level is always 0.
        current_level = len(re.match(r'^(\s+)', line).group(1)) // 4 if re.match(r'^(\s+)', line) else 0
        if current_level > level:
            # Going one level deeper: look up and validate that level's entries.
            level += 1
            if len(prefix_format) >= level:
                format_dict = prefix_format[level - 1]
                validate_prefix_format(format_dict)
            else:
                raise ValueError(f"Missing prefix format for level {level}")
            if len(hierarchy) >= level:
                preference_list = hierarchy[level - 1]
                validate_hierarchy(preference_list)
            else:
                raise ValueError(f"Missing hierarchy for level {level}")
        elif current_level < level:
            # Going back up: unwind one stack entry per level.
            # NOTE(review): this branch assigns neither format_dict nor
            # preference_list; the values from an earlier iteration are reused.
            while current_level < level and stack:
                stack.pop()
                used_prefixes.popitem()
                level -= 1
        else:
            # Same level.  With level == 0 the index is -1, i.e. the last
            # entry of each list (IndexError if the list is empty).
            format_dict = prefix_format[level - 1]
            preference_list = hierarchy[level - 1]
        try:
            # prefix = first token, separator = following whitespace,
            # content = the rest of the line (optional).
            match = re.match(r'(\d+\.|\w+\.|\w+\-|\w+\:|\S+)(\s+)(.+)?', line)
            if match:
                prefix = match.group(1)
                separator = match.group(2)
                content = match.group(3) or 'N/A'
            else:
                prefix = ''
                separator = ''
                content = ''
            # NOTE(review): normalize_prefix has code paths that end without a
            # return statement; unpacking None here raises TypeError, which
            # the except clause below does not catch.
            prefix, separator, content, different_level = normalize_prefix(prefix, stack, used_prefixes=used_prefixes, separator=separator, mode='lenient', hierarchy=preference_list)
        except ValueError as e:
            print(e)
            continue
        else:
            if not content or content.isspace():
                content = 'N/A'
            output_string += format_output(prefix, separator, content, level, format_dict, preference_list)
            if different_level:
                stack.append(prefix)
    return output_string
def infer_hierarchy(input_string):
    """Build a list of prefix-type preference lists from ``input_string``.

    For every line that starts with optional whitespace followed by a prefix
    token, the token is classified as ``'number'`` (all digits), ``'letter'``
    (all alphabetic) or ``'other'``.  The width of the leading whitespace is
    used as an index into the result: if an entry already exists at that
    index the type is added to it (once), otherwise a new one-element entry
    is appended to the end of the result.

    Args:
        input_string: Text whose lines are separated by newline characters.

    Returns:
        A list of lists of type names, possibly empty.
    """
    prefix_pattern = re.compile(r'^(\s*)(\d+\.|\w+\.|\w+\-|\w+\:|\S+)')

    def classify(token):
        # Tokens that still carry a trailing '.', '-' or ':' are neither
        # purely numeric nor purely alphabetic and end up as 'other'.
        if token.isdigit():
            return 'number'
        if token.isalpha():
            return 'letter'
        return 'other'

    levels = []
    for raw_line in input_string.split('\n'):
        found = prefix_pattern.match(raw_line)
        if found is None:
            continue
        indent_width = len(found.group(1))
        kind = classify(found.group(2))
        if indent_width >= len(levels):
            # No entry at this index yet: start a new one at the end.
            levels.append([kind])
        elif kind not in levels[indent_width]:
            levels[indent_width].append(kind)
    return levels
def validate_hierarchy(preference_list):
    """Raise ``ValueError`` unless ``preference_list`` is a valid preference list.

    A valid preference list is a non-empty ``list`` whose elements are all
    strings taken from ``'number'``, ``'letter'`` and ``'other'``.

    Args:
        preference_list: The object to validate.

    Raises:
        ValueError: If the argument is not a list, is empty, contains a
            non-string element, or contains a string outside the allowed set.
    """
    allowed = ('number', 'letter', 'other')
    if not isinstance(preference_list, list):
        raise ValueError("Preference list must be a list")
    if len(preference_list) == 0:
        raise ValueError("Preference list cannot be empty")
    # Elements are checked in order; the first offending one decides the error.
    for entry in preference_list:
        if not isinstance(entry, str):
            raise ValueError("Preference list must contain only strings")
        if entry not in allowed:
            raise ValueError("Preference list must contain only 'number', 'letter', or 'other'")
def format_output(prefix, separator, content, level, format_dict, hierarchy):
    """Render one output line: indentation, decorated prefix, separator, content.

    The line starts with ``level * 4`` spaces.  The prefix decoration is taken
    from ``hierarchy[level]`` when ``hierarchy`` is truthy and has an entry at
    index ``level``; that entry is indexed positionally (0 for an all-digit
    prefix, 1 for an all-alphabetic prefix, 2 as a ``str.format`` template for
    anything else).  Otherwise ``format_dict`` is used with the keys
    ``'number'``, ``'letter'`` and ``'other'`` in the same roles.

    Args:
        prefix: The (normalized) prefix text.
        separator: Text inserted between the decorated prefix and the content.
        content: The item text.
        level: Indentation level.
        format_dict: Mapping with ``'number'``, ``'letter'`` and ``'other'``.
        hierarchy: Sequence indexed by ``level``, or a falsy value.

    Returns:
        The formatted line, terminated by a newline character.
    """
    indent = ' ' * (level * 4)

    # Choose where the decoration comes from and how it is addressed; only the
    # slot that is actually needed is looked up afterwards.
    if hierarchy and level < len(hierarchy):
        source = hierarchy[level]
        number_slot, letter_slot, other_slot = 0, 1, 2
    else:
        source = format_dict
        number_slot, letter_slot, other_slot = 'number', 'letter', 'other'

    if prefix.isdigit():
        label = prefix + source[number_slot]
    elif prefix.isalpha():
        label = prefix + source[letter_slot]
    else:
        label = source[other_slot].format(prefix)

    return indent + label + ' ' + separator + content + '\n'
def validate_prefix_format(format_dict):
    """Raise ``ValueError`` unless ``format_dict`` is a valid prefix format.

    The dictionary must contain the keys ``'number'``, ``'letter'`` and
    ``'other'``.  Every value (or, for a non-string value, its first element)
    must be ``'.'``, ``'-'`` or start with the placeholder ``'({})'``.

    Side effect: when a key ``'l'`` is present, its (possibly unwrapped) value
    is written to ``format_dict['letter']``.

    Args:
        format_dict: The mapping to validate.

    Raises:
        ValueError: If a required key is missing or a value is not accepted.
    """
    required_keys = ('number', 'letter', 'other')
    if any(name not in format_dict for name in required_keys):
        raise ValueError("Prefix format dictionary must have keys for numbers, letters, and other types of prefixes")

    for kind, spec in format_dict.items():
        # Anything that is not a plain string (lists included) is unwrapped
        # to its first element before being checked.
        if not isinstance(spec, str):
            spec = spec[0]
        is_plain_separator = spec == '.' or spec == '-'
        if not is_plain_separator and not re.match(r'\(\{\}\)', spec):
            raise ValueError(f"Prefix format dictionary must have a valid separator or format for {kind} type of prefix")
        # 'l' acts as an alias whose value overrides the 'letter' entry.
        if kind == 'l':
            format_dict['letter'] = spec

def normalize_prefix(prefix, previous_prefixes, used_prefixes={}, separator='.', mode='lenient', hierarchy=None):
    """
    Normalize a list-item prefix against the prefixes seen so far.

    The intent, as described by the original author: a normalized prefix follows a logical sequence or hierarchy based on the
    previous prefixes at the same level, does not contain both numbers and letters, is a valid alphanumeric character or a symbol,
    and is not repeated at different levels of indentation.

    Args:
        prefix: The raw prefix text.
        previous_prefixes: List of prefixes already accepted; its length is used as the current level.
        used_prefixes: Dict counting how often a mixed number/letter prefix was seen.
            NOTE(review): the default ``{}`` is a mutable default argument and is shared by all calls that omit it.
        separator: Incoming separator; it is overwritten by the regex cascade below before it is ever read.
        mode: ``'strict'`` raises ``ValueError`` on invalid or inconsistent prefixes; any other value rewrites them instead.
        hierarchy: Sequence indexed by level whose elements are preference lists, e.g.
            ``[['number', 'letter', 'other'], ['letter', 'number', 'other']]``; only the first element of the selected
            preference list is consulted.

    Returns:
        A 4-tuple ``(prefix, separator_or_format, content, different_level)`` from every explicit ``return``;
        ``different_level`` is ``True`` only when a default prefix was generated for an empty prefix.
        Several paths end without a ``return`` statement, in which case the result is ``None``.

    Raises:
        ValueError: In ``'strict'`` mode, for invalid or inconsistent prefixes.
    """
    format_dict = {'number': '.', 'letter': '-', 'other': '({})'}
    hashed_prefix = hashlib.md5(prefix.encode()).hexdigest()
    # split prefix into number and letter parts
    match = re.match(r'^(\d+)(\w+)', prefix)
    if match:
        number_part = match.group(1)
        letter_part = match.group(2)
        # NOTE(review): both recursive calls are unpacked as 4-tuples; a call that ends without a return yields None
        # and the unpacking then raises TypeError.
        # normalize number part
        number_part, number_separator, _, _ = normalize_prefix(number_part, previous_prefixes, used_prefixes=used_prefixes, separator=format_dict['number'], mode=mode, hierarchy=hierarchy)
        # normalize letter part
        letter_part, letter_separator, _, _ = normalize_prefix(letter_part, previous_prefixes, used_prefixes=used_prefixes, separator=format_dict['letter'], mode=mode, hierarchy=hierarchy)
        # combine number and letter parts
        content = prefix.replace(number_part, '').replace(letter_part, '')
        return (number_part + letter_part, format_dict['number'] if number_part.isdigit() else format_dict['letter'], content, False)
    # use a list of symbols
    symbols = ['•', '◦', '▪', '▫', '▸', '◂', '▴', '▾']
    # check if prefix is a symbol
    if prefix in symbols:
        # normalize symbol according to its position in the list
        index = symbols.index(prefix)
        sorted_symbols = sorted([p for p in previous_prefixes if p in symbols], key=symbols.index)
        new_index = sorted_symbols.index(prefix) if prefix in sorted_symbols else len(sorted_symbols)
        new_prefix = symbols[new_index]
        # prefix is a single symbol character here, so this slice is the empty string
        content = prefix[1:]
        return (new_prefix, format_dict['other'].format(prefix), content, False)
    # Split the prefix into its first non-space character, an optional run of non-word characters and the remainder.
    # After this cascade ``prefix`` is either a single character or the empty string.
    match = re.match(r'^(\w|\S)(\W+)(.+)', prefix)
    if match:
        prefix = match.group(1)
        separator = match.group(2)
        content = match.group(3)
    else:
        match = re.match(r'^(\w|\S)(.+)', prefix)
        if match:
            prefix = match.group(1)
            separator = ''
            content = match.group(2)
        else:
            match = re.match(r'^(\w|\S)', prefix)
            if match:
                prefix = match.group(1)
                separator = ''
                content = ''
            else:
                prefix = ''
                separator = ''
                content = ''
    # handle empty or whitespace prefixes by assigning a default prefix based on the hierarchy
    if not prefix or prefix.isspace():
        level = len(previous_prefixes)
        # NOTE(review): hierarchy defaults to None, in which case len(hierarchy) raises TypeError here.
        if level < len(hierarchy):
            preference_list = hierarchy[level]
            first_type = preference_list[0]
            if first_type == 'number':
                sorted_number_prefixes = sorted([p for p in previous_prefixes if p.isdigit()], key=int)
                new_prefix = str(int(sorted_number_prefixes[-1]) + 1) if sorted_number_prefixes else '1'
            elif first_type == 'letter':
                sorted_letter_prefixes = sorted([p for p in previous_prefixes if p.isalpha()])
                new_prefix = chr(ord(sorted_letter_prefixes[-1]) + 1) if sorted_letter_prefixes else 'A'
            else:
                symbols = ['•', '◦', '▪', '▫', '▸', '◂', '▴', '▾']
                sorted_symbol_prefixes = sorted([p for p in previous_prefixes if p in symbols], key=symbols.index)
                new_prefix = symbols[symbols.index(sorted_symbol_prefixes[-1]) + 1] if sorted_symbol_prefixes else symbols[0]
            return (new_prefix, format_dict[first_type], content, True)
        else:
            return ('*', format_dict['other'].format('*'), content, True)
    # check if prefix contains both numbers and letters
    # NOTE(review): prefix is a single character at this point, while the pattern needs at least two characters.
    if re.match(r'\d+\w+|\w+\d+', prefix):
        if mode == 'strict':
            raise ValueError(f"Invalid prefix: {prefix}")
        else:
            unique_prefix = prefix + str(used_prefixes.get(prefix, 0))
            used_prefixes[prefix] = used_prefixes.get(prefix, 0) + 1
            return (unique_prefix, format_dict['other'].format(prefix), content, False)
    # check if prefix is a valid alphanumeric character or a symbol
    if not re.match(r'\w|\S', prefix):
        if mode == 'strict':
            raise ValueError(f"Invalid prefix: {prefix}")
        else:
            return (hashed_prefix, format_dict['other'].format(prefix), content, False)
    # check if there are previous prefixes at the same level
    if previous_prefixes:
        last_prefix = previous_prefixes[-1]
        # check if prefix has the same type as the last prefix
        # (when the types differ, nothing below returns and the function yields None)
        if (prefix.isdigit() == last_prefix.isdigit()) and (prefix.isalpha() == last_prefix.isalpha()):
            # check if prefix is a number
            if prefix.isdigit():
                # check if prefix has one digit
                if len(prefix) == 1:
                    if mode == 'strict':
                        raise ValueError(f"Inconsistent prefix: {prefix}")
                    else:
                        # increment the prefix by one
                        new_prefix = str(int(prefix) + 1)
                        return (new_prefix, format_dict['number'], content, False)
                # check if prefix is less than or equal to the last prefix
                elif int(prefix) <= int(last_prefix):
                    if mode == 'strict':
                        raise ValueError(f"Inconsistent prefix: {prefix}")
                    else:
                        # sort the prefixes and find the index of the current prefix
                        sorted_prefixes = sorted(previous_prefixes + [prefix], key=int)
                        index = sorted_prefixes.index(prefix)
                        # assign a new prefix based on the index
                        new_prefix = str(index + 1)
                        return (new_prefix, format_dict['number'], content, False)
                else:
                    pass
            # check if prefix is a letter
            elif prefix.isalpha():
                # check if prefix has one letter
                if len(prefix) == 1:
                    if mode == 'strict':
                        raise ValueError(f"Inconsistent prefix: {prefix}")
                    else:
                        # increment the prefix by one
                        new_prefix = chr(ord(prefix) + 1)
                        return (new_prefix, format_dict['letter'], content, False)
                # check if prefix is less than or equal to the last prefix
                elif ord(prefix) <= ord(last_prefix):
                    if mode == 'strict':
                        raise ValueError(f"Inconsistent prefix: {prefix}")
                    else:
                        # sort the prefixes and find the index of the current prefix
                        sorted_prefixes = sorted(previous_prefixes + [prefix])
                        index = sorted_prefixes.index(prefix)
                        # assign a new prefix based on the index
                        new_prefix = chr(ord('A') + index)
                        return (new_prefix, format_dict['letter'], content, False)
                else:
                    pass
            else:
                pass
    else:
        # check if there is a hierarchy for the current level
        # (a falsy hierarchy falls through to the end of the function and yields None)
        if hierarchy:
            level = len(previous_prefixes)
            # check if the hierarchy list has an element for the current level
            if level < len(hierarchy):
                preference_list = hierarchy[level]
                # check if the prefix type matches the first preference for the current level
                if (prefix.isdigit() and preference_list[0] == 'number') or (prefix.isalpha() and preference_list[0] == 'letter') or (not prefix.isdigit() and not prefix.isalpha() and preference_list[0] == 'other'):
                    # matching type: no return statement follows, so the result is None
                    pass
                else:
                    # handle inconsistent prefixes according to the mode argument
                    if mode == 'strict':
                        raise ValueError(f"Inconsistent prefix: {prefix}")
                    else:
                        # assign a new prefix based on the first preference for the current level
                        first_type = preference_list[0]
                        if first_type == 'number':
                            sorted_number_prefixes = sorted([p for p in previous_prefixes if p.isdigit()], key=int)
                            new_prefix = str(int(sorted_number_prefixes[-1]) + 1) if sorted_number_prefixes else '1'
                        elif first_type == 'letter':
                            sorted_letter_prefixes = sorted([p for p in previous_prefixes if p.isalpha()])
                            new_prefix = chr(ord(sorted_letter_prefixes[-1]) + 1) if sorted_letter_prefixes else 'A'
                        else:
                            symbols = ['•', '◦', '▪', '▫', '▸', '◂', '▴', '▾']
                            sorted_symbol_prefixes = sorted([p for p in previous_prefixes if p in symbols], key=symbols.index)
                            new_prefix = symbols[symbols.index(sorted_symbol_prefixes[-1]) + 1] if sorted_symbol_prefixes else symbols[0]
                        # NOTE(review): returns the unformatted '({})' template regardless of first_type -- confirm intended
                        return (new_prefix, format_dict['other'], content, False)
            else:
                return (hashed_prefix, format_dict['other'].format(prefix), content, False)
def infer_prefix_format(input_string):
    """Infer prefix/separator pairs from the first line of ``input_string``.

    The first line is scanned for every occurrence of a single non-space
    character followed by a run of non-word characters.  Each occurrence
    contributes one single-key dictionary to the result, depending on the
    character:

    * a digit      -> ``{'number': separator}``
    * a letter     -> ``{'letter': separator}``
    * anything else -> ``{'other': '({})'}``

    Args:
        input_string: Text to inspect; only the part before the first
            newline character is used.

    Returns:
        A list of single-key dictionaries in order of appearance; empty when
        the first line contains no such occurrence.
    """
    first_line = input_string.split('\n')[0]
    inferred_format = []
    for prefix, separator in re.findall(r'(\w|\S)(\W+)', first_line):
        if prefix.isdigit():
            inferred_format.append({'number': separator})
        elif prefix.isalpha():
            # This also covers the letter 'o': a dedicated ``prefix == 'o'``
            # branch placed after this test could never be reached, because
            # 'o'.isalpha() is true.
            inferred_format.append({'letter': separator})
        else:
            inferred_format.append({'other': '({})'})
    return inferred_format

# Sample input: filler paragraphs interleaved with several list styles
# (numbered, dashed, lettered, roman-numeral, indented and bullet lists).
testData = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent hendrerit dapibus posuere. Suspendisse tempus, mi rhoncus gravida pulvinar, orci orci congue diam, vel malesuada felis elit sit amet nisl. Aliquam sagittis facilisis rhoncus. \n\n1. England Cities:\n- Liverpool\n- London\n- Huyton\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\nB. Spain Cities: \na. Place1 \nb. Place2 \nc. Place3\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\nIII. Germany Cities: \ni. Place1 \nii. Place2 \niii. Place3\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\n1. England Cities:\n 1. Liverpool\n 2. London\n 3. Huyton\n \nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\n• Spain Cities:\n◦ Place1\n◦ Place2\n◦ Place3\n\na. Germany\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\na. England\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent hendrerit dapibus posuere. Suspendisse tempus, mi rhoncus gravida pulvinar, orci orci congue diam, vel malesuada felis elit sit amet nisl. Aliquam sagittis facilisis rhoncus."
# Step 1: keep only the lines that look like list items, and show them.
output1 = filter_lists(testData)
print(output1)
# Step 2: feed the filtered lines to the formatter and show its result.
output2 = process_string(output1)
print("The output is: "+output2)

代码用途:

我创建了一个Python函数,它接受一个字符串作为输入。该字符串包含若干列表以及非列表内容。代码应当只输出列表数据:用每个列表顶部的项目(标题)作为上下文来打印该列表中的每个项目,并删除与列表无关的非列表内容。下面提供了输入和预期输出的示例。

代码问题:

我只能从输入中提取列表项,但到目前为止,我还不能以前面描述的格式输出它们(如下面预期的输出所示)。

输入:

"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent hendrerit dapibus posuere. Suspendisse tempus, mi rhoncus gravida pulvinar, orci orci congue diam, vel malesuada felis elit sit amet nisl. Aliquam sagittis facilisis rhoncus. \n\n1. England Cities:\n- Liverpool\n- London\n- Huyton\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\nB. Spain Cities: \na. Place1 \nb. Place2 \nc. Place3\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\nIII. Germany Cities: \ni. Place1 \nii. Place2 \niii. Place3\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\n1. England Cities:\n 1. Liverpool\n 2. London\n 3. Huyton\n \nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\n• Spain Cities:\n◦ Place1\n◦ Place2\n◦ Place3\n\na. Germany\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\na. England\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent hendrerit dapibus posuere. Suspendisse tempus, mi rhoncus gravida pulvinar, orci orci congue diam, vel malesuada felis elit sit amet nisl. Aliquam sagittis facilisis rhoncus."

预期输出:

['England Cities: Liverpool', 'England Cities: London', 'England Cities: Huyton', 'Spain Cities: Place1', 'Spain Cities: Place2', 'Spain Cities: Place3', 'Germany Cities: Place1', 'Germany Cities: Place2', 'Germany Cities: Place3', 'England Cities: Liverpool', 'England Cities: London', 'England Cities: Huyton', 'Spain Cities: Place1', 'Spain Cities: Place2', 'Spain Cities: Place3', 'Germany', 'England']

实际输出:

1. England Cities:
- Liverpool
- London
- Huyton
B. Spain Cities: 
a. Place1 
b. Place2 
c. Place3
i. Place1 
ii. Place2 
iii. Place3
1. England Cities:
 1. Liverpool
 2. London
 3. Huyton
• Spain Cities:
◦ Place1
◦ Place2
◦ Place3
a. Germany
a. England

h ({}).
h ({})lace2
h ({})iverpool
h ({})ermany

如何修改代码才能使其正常工作?

tuwxkamq

tuwxkamq1#

一个建议:

import re

s = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent hendrerit dapibus posuere. Suspendisse tempus, mi rhoncus gravida pulvinar, orci orci congue diam, vel malesuada felis elit sit amet nisl. Aliquam sagittis facilisis rhoncus. \n\n1. England Cities:\n- Liverpool\n- London\n- Huyton\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\nB. Spain Cities: \na. Barcelona \nb. Córdoba \nc. Valladolid\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\nIII. Germany Cities: \ni. Köln \nii. Hambourg \niii. Aachen\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\n1. England Cities:\n 1. Cardiff\n 2. Bristol\n 3. Coventry\n \nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\n• Spain Cities:\n◦ Toledo\n◦ Sevilla\n◦ Zaragoza\n\na. Germany\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit.\n\na. England\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent hendrerit dapibus posuere. Suspendisse tempus, mi rhoncus gravida pulvinar, orci orci congue diam, vel malesuada felis elit sit amet nisl. Aliquam sagittis facilisis rhoncus."

# pList matches one whole list "block": a marker (up to four word characters
# followed by one of - • ◦ .), the title, and then either a colon plus the
# item lines up to a blank line / end of text, or directly a blank line
# (a title without items).
pList = re.compile(r'^\w{0,4}[-•◦.] +(?P<title>.*)(?:: *\n(?P<items>(?:.*$\n?)+?)(?: *\n|\Z)| *\n\n)', re.M)
# pItem extracts the text of one item line, skipping its marker.
pItem = re.compile(r'^ *\S+ +(?P<item>.*\S)', re.M)

result = []

for m in pList.finditer(s):
    title = m.group('title')
    items = m.group('items')
    if not items:
        # Title without items: keep the title on its own.
        result.append(title)
        continue
    # Prefix every item of the block with the block's title.
    result.extend(f"{title}: {i.group('item')}" for i in pItem.finditer(items))

print(result)

在这里,我选择不将文本拆分为行。列表项(或孤立列表标题)由两项标识:结尾处的双换行符序列和以“块”开头处的符号结束的序列。
请注意,对项目起始部分的描述其实相当粗略:标题用\w{0,4}[-•◦.]匹配,列表项用\S+匹配,您可以自行编写更精确的模式。此外,如果您对这些部分进行捕获,并对第二个模式使用re.findall方法,就能够像初始代码那样执行更多检查。

相关问题