你如何将这个正则表达式惯用法从Perl翻译成Python?
大约一年前我从Perl切换到Python,从此没有回头。 只有一种惯用法,我发现在Perl中比在Python中更容易做到:
# Try each pattern in turn; the first one that matches wins and its
# captured text is available as $1 inside the corresponding branch.
if ($var =~ /foo(.+)/) {
  # do something with $1
} elsif ($var =~ /bar(.+)/) {
  # do something with $1
} elsif ($var =~ /baz(.+)/) {
  # do something with $1
}
相应的Python代码不是很优雅,因为if语句会不断嵌套下去:
# Straightforward translation of the Perl chain: every additional
# pattern adds one more level of nesting.
m = re.search(r'foo(.+)', var)
if m:
    # do something with m.group(1)
    pass
else:
    m = re.search(r'bar(.+)', var)
    if m:
        # do something with m.group(1)
        pass
    else:
        m = re.search(r'baz(.+)', var)
        if m:
            # do something with m.group(1)
            # (the pattern has a single capture group, so group(1) it is)
            pass
有没有人有一个优雅的方式在Python中重现这种模式? 我见过使用匿名函数的调度表,但是对于少数几个正则表达式来说,这种做法对我来说似乎有些笨重。
感谢另一个SO问题:
import re


class DataHolder:
    """Store a value under a configurable attribute name and hand it back.

    Calling the instance stores the value and returns it, so an assignment
    can be performed inside an ``if``/``elif`` condition.

    Args:
        value: initial value to store.
        attr_name: name of the instance attribute the value is stored under.
    """

    def __init__(self, value=None, attr_name='value'):
        self._attr_name = attr_name
        self.set(value)

    def __call__(self, value):
        """Store ``value`` and return it (same as ``set``)."""
        return self.set(value)

    def set(self, value):
        """Store ``value`` under the configured attribute and return it."""
        setattr(self, self._attr_name, value)
        return value

    def get(self):
        """Return the value currently stored under the configured attribute."""
        return getattr(self, self._attr_name)


string = u'test bar 123'

save_match = DataHolder(attr_name='match')
# Each condition both runs the search and remembers its result, which is
# then reachable as save_match.match inside the branch.
if save_match(re.search(r'foo (\d+)', string)):
    print("Foo")
    print(save_match.match.group(1))
elif save_match(re.search(r'bar (\d+)', string)):
    print("Bar")
    print(save_match.match.group(1))
elif save_match(re.search(r'baz (\d+)', string)):
    print("Baz")
    print(save_match.match.group(1))
使用命名组和调度表:
# One pattern recognises every command; the named groups split the input
# into the command word and the remaining data.
r = re.compile(r'(?P<cmd>foo|bar|baz)(?P<data>.+)')


def do_foo(data):
    ...


def do_bar(data):
    ...


def do_baz(data):
    ...


# Command word -> handler.
dispatch = dict(foo=do_foo, bar=do_bar, baz=do_baz)

m = r.match(var)
if m is not None:
    dispatch[m.group('cmd')](m.group('data'))
通过一点内省(introspection),你可以自动生成正则表达式和调度表。
是的,这有点烦人。 也许这将适用于你的情况。
import re


class ReCheck(object):
    """Run ``re.search`` and remember the most recent result.

    The result of the latest ``check`` call (a match object or ``None``)
    is kept in ``self.result``.
    """

    def __init__(self):
        self.result = None

    def check(self, pattern, text):
        """Search ``text`` for ``pattern``; store and return the result."""
        self.result = re.search(pattern, text)
        return self.result


var = 'bar stuff'
m = ReCheck()
if m.check(r'foo(.+)', var):
    print(m.result.group(1))
elif m.check(r'bar(.+)', var):
    print(m.result.group(1))
elif m.check(r'baz(.+)', var):
    print(m.result.group(1))
编辑:布莱恩正确地指出,我的第一次尝试没有奏效。 不幸的是,这次尝试的代码更长。
r"""
This is an extension of the re module. It stores the last successful
match object and lets you access its methods and attributes via
this module.

This module exports the following additional functions:
    expand     Return the string obtained by doing backslash substitution
               on a template string.
    group      Returns one or more subgroups of the match.
    groups     Return a tuple containing all the subgroups of the match.
    groupdict  Return a dictionary containing all the named subgroups of
               the match.
    start      Return the indices of the start of the substring matched by
               group.
    end        Return the indices of the end of the substring matched by
               group.
    span       Returns a 2-tuple of (start(), end()) of the substring
               matched by group.

This module defines the following additional public attributes:
    pos        The value of pos which was passed to the search() or match()
               method.
    endpos     The value of endpos which was passed to the search() or
               match() method.
    lastindex  The integer index of the last matched capturing group.
    lastgroup  The name of the last matched capturing group.
    re         The regular expression object which was passed to search()
               or match().
    string     The string passed to match() or search().
"""

import re as re_
from re import *
from functools import wraps

__all__ = re_.__all__ + [
    "expand", "group", "groups", "groupdict", "start", "end", "span",
    "last_match", "pos", "endpos", "lastindex", "lastgroup", "re", "string",
]

last_match = pos = endpos = lastindex = lastgroup = re = string = None


def _set_match(match=None):
    """Record ``match`` as the last successful match and return it.

    A ``None`` argument leaves the previously recorded match untouched.
    """
    global last_match, pos, endpos, lastindex, lastgroup, re, string
    if match is not None:
        last_match = match
        pos = match.pos
        endpos = match.endpos
        lastindex = match.lastindex
        lastgroup = match.lastgroup
        re = match.re
        string = match.string
    return match


def _require_match():
    """Return the last successful match or raise TypeError if there is none."""
    if last_match is None:
        raise TypeError("No successful match yet.")
    return last_match


@wraps(re_.match)
def match(pattern, string, flags=0):
    return _set_match(re_.match(pattern, string, flags))


@wraps(re_.search)
def search(pattern, string, flags=0):
    return _set_match(re_.search(pattern, string, flags))


@wraps(re_.findall)
def findall(pattern, string, flags=0):
    matches = re_.findall(pattern, string, flags)
    if matches:
        # re.findall returns strings/tuples, not match objects, so the last
        # match object has to be obtained separately via finditer.
        last = None
        for last in re_.finditer(pattern, string, flags):
            pass
        _set_match(last)
    return matches


@wraps(re_.finditer)
def finditer(pattern, string, flags=0):
    for match in re_.finditer(pattern, string, flags):
        yield _set_match(match)


def expand(template):
    """Call ``expand(template)`` on the last successful match."""
    return _require_match().expand(template)


def group(*indices):
    """Call ``group(*indices)`` on the last successful match."""
    return _require_match().group(*indices)


def groups(default=None):
    """Call ``groups(default)`` on the last successful match."""
    return _require_match().groups(default)


def groupdict(default=None):
    """Call ``groupdict(default)`` on the last successful match."""
    return _require_match().groupdict(default)


def start(group=0):
    """Call ``start(group)`` on the last successful match."""
    return _require_match().start(group)


def end(group=0):
    """Call ``end(group)`` on the last successful match."""
    return _require_match().end(group)


def span(group=0):
    """Call ``span(group)`` on the last successful match."""
    return _require_match().span(group)


del wraps  # Not needed past module compilation
例如:
# With the module above imported as `gre`, the chain stays flat: the match
# found by the condition is read back through the module itself.
if gre.match("foo(.+)", var):
    # do something with gre.group(1)
    pass
elif gre.match("bar(.+)", var):
    # do something with gre.group(1)
    pass
elif gre.match("baz(.+)", var):
    # do something with gre.group(1)
    pass
我建议这样做,因为它使用最少的正则表达式来实现你的目标。 它仍然是可以工作的代码,而且不会比你原来的Perl更糟。
import re

var = "barbazfoo"

# One search handles all three prefixes: group(1) is the prefix that
# matched, group(2) is the text that follows it.
m = re.search(r'(foo|bar|baz)(.+)', var)
if m is None:
    # None of the prefixes occurs in var; there is nothing to dispatch on.
    pass
elif m.group(1) == 'foo':
    print(m.group(2))
    # do something with m.group(2)
elif m.group(1) == "bar":
    print(m.group(2))
    # do something with m.group(2)
elif m.group(1) == "baz":
    print(m.group(2))
    # do something with m.group(2)
或者,采用完全不使用正则表达式的做法:
# Split off the fixed-width three-character prefix and branch on it.
prefix, data = var[:3], var[3:]
if prefix == 'foo':
    # do something with data
    pass
elif prefix == 'bar':
    # do something with data
    pass
elif prefix == 'baz':
    # do something with data
    pass
else:
    # do something with var
    pass
这是否合适取决于您的实际问题。 不要忘记,正则表达式在Python中并不像在Perl中那样是瑞士军刀; Python有不同的字符串操作结构。
def find_first_match(string, *regexes):
    """Call the handler paired with the first pattern found in ``string``.

    Args:
        string: text to search.
        *regexes: ``(pattern, handler)`` pairs, tried in the given order.
            The handler of the first matching pattern is called with the
            match object; later pairs are not tried.

    Raises:
        ValueError: if none of the patterns matches.
    """
    for regex, handler in regexes:
        m = re.search(regex, string)
        if m:
            handler(m)
            return
    raise ValueError("no pattern matched %r" % (string,))


find_first_match(
    foo,
    (r'foo(.+)', handle_foo),
    (r'bar(.+)', handle_bar),
    (r'baz(.+)', handle_baz))
为了加快速度,可以将所有的正则表达式合并为一个正则表达式,并即时创建调度器。 理想情况下,这会被封装成一个类。
这是我解决这个问题的方法:
# `matched` keeps later blocks from running once one pattern has succeeded.
# NOTE(review): `var` stands for the string being tested; the original
# calls omitted the subject argument that re.match requires.
matched = False

m = re.match("regex1", var)
if not matched and m:
    # do something
    matched = True

m = re.match("regex2", var)
if not matched and m:
    # do something else
    matched = True

m = re.match("regex3", var)
if not matched and m:
    # do yet something else
    matched = True
远不如原来的模式干净。 但是,它简单、直接,不需要额外的模块,也不需要你修改原来的正则表达式。
如何使用字典?
match_objects = {}
# setdefault stores the search result under the key and returns it, so the
# match object stays reachable inside the branch. Each key must be used
# only once: for an existing key setdefault returns the stored value
# instead of the new search result.
if match_objects.setdefault('mo_foo', re_foo.search(text)):
    # do something with match_objects['mo_foo']
    pass
elif match_objects.setdefault('mo_bar', re_bar.search(text)):
    # do something with match_objects['mo_bar']
    pass
elif match_objects.setdefault('mo_baz', re_baz.search(text)):
    # do something with match_objects['mo_baz']
    pass
# ... further patterns follow the same shape
但是,必须确保match_objects字典中没有重复的键(mo_foo,mo_bar,…),最好是给每个正则表达式分配自己的名字并相应地命名match_objects的键,否则match_objects.setdefault()方法将返回已有的匹配对象,而不是通过运行re_xxx.search(text)得到的新匹配对象。
简约的DataHolder:
class Holder(object):
    """Callable that remembers the last value it was called with.

    Called with an argument it stores and returns that argument; called
    with no arguments it returns the stored value.
    """

    def __call__(self, *x):
        if x:
            self.x = x[0]
        return self.x


data = Holder()
if data(re.search(r'foo (\d+)', string)):
    print(data().group(1))
或者作为一个单例函数:
def data(*x):
    """Remember a value on the function object itself.

    With an argument, store the first argument as ``data.x`` and return it;
    without arguments, return the value stored by an earlier call.
    """
    if len(x) > 0:
        setattr(data, "x", x[0])
    return getattr(data, "x")
在Pat Notz的解决方案基础上稍作扩展,我发现下面的做法更加优雅:
– 使用与 re
模块相同的方法名(例如用search()
而不是check()
)并且
– 在holder对象本身上实现像group()
这样的必要方法:
class Re(object):
    """Holder that runs ``re.search`` and exposes the latest result.

    ``search`` keeps its outcome in ``self.result``; ``group`` reads from
    that stored match.
    """

    def __init__(self):
        self.result = None

    def search(self, pattern, text):
        """Search ``text`` for ``pattern``; store and return the outcome."""
        found = re.search(pattern, text)
        self.result = found
        return found

    def group(self, index):
        """Return group ``index`` of the stored match."""
        stored = self.result
        return stored.group(index)
例
而不是这样的:
# Plain-`re` version: each further command form needs one more nesting level.
# NOTE(review): `line` and `vars` are defined outside this snippet.
m = re.search(r'set ([^ ]+) to ([^ ]+)', line)
if m:
    # "set NAME to VALUE": store VALUE under NAME.
    vars[m.group(1)] = m.group(2)
else:
    m = re.search(r'print ([^ ]+)', line)
    if m:
        # "print NAME": show the value stored under NAME.
        print(vars[m.group(1)])
    else:
        m = re.search(r'add ([^ ]+) to ([^ ]+)', line)
        if m:
            # "add A to B": add the value of A onto the value of B.
            vars[m.group(2)] += vars[m.group(1)]
一个是这样做的:
# Holder version: the same three command forms as a flat if/elif chain.
# NOTE(review): `line` and `vars` are defined outside this snippet.
m = Re()
...
if m.search(r'set ([^ ]+) to ([^ ]+)', line):
    # "set NAME to VALUE": store VALUE under NAME.
    vars[m.group(1)] = m.group(2)
elif m.search(r'print ([^ ]+)', line):
    # "print NAME": show the value stored under NAME.
    print(vars[m.group(1)])
elif m.search(r'add ([^ ]+) to ([^ ]+)', line):
    # "add A to B": add the value of A onto the value of B.
    vars[m.group(2)] += vars[m.group(1)]
最终看起来非常自然,从Perl迁移时不需要太多的代码更改,并且避免了其他一些解决方案存在的全局状态问题。
我的解决scheme是:
import re


class Found(Exception):
    """Raised to leave the try block as soon as one pattern has matched."""


try:
    # Each loop body runs at most once: the first match raises Found,
    # which skips all remaining patterns.
    for m in re.finditer('bar(.+)', var):
        # Do something
        raise Found()
    for m in re.finditer('foo(.+)', var):
        # Do something else
        raise Found()
except Found:
    pass
这里是一个RegexDispatcher类,它通过正则表达式来调度其子类的方法。
每个可调度的方法都用一个正则表达式进行注解,例如
def plus(self, regex: r"\+", **kwargs):
    # The annotation on the `regex` parameter carries the pattern for this
    # method; r"\+" matches a literal plus sign.
    ...
在这种情况下,注解名为“regex”,它的值是要匹配的正则表达式“\+”,即+符号。 这些带注解的方法放在子类中,而不是放在基类中。
当在一个字符串上调用dispatch(…)方法时,该类将找到与该字符串匹配的注解正则表达式并调用对应的方法。 这是这个类:
import inspect
import re


class RegexMethod:
    """Pair a method with the regex stored in one of its annotations.

    Args:
        method: the callable to wrap.
        annotation: key in ``method.__annotations__`` whose value is the
            regex for this method.
    """

    def __init__(self, method, annotation):
        self.method = method
        self.name = self.method.__name__
        # Line number of the method in its source file; used by the
        # dispatcher to try methods in definition order.
        self.order = inspect.getsourcelines(self.method)[1]
        self.regex = self.method.__annotations__[annotation]

    def match(self, s):
        """Return the result of ``re.match`` of this method's regex on ``s``."""
        return re.match(self.regex, s)

    # Make it callable
    def __call__(self, *args, **kwargs):
        return self.method(*args, **kwargs)

    def __str__(self):
        # Plain %-formatting only: the regex may contain "{" or "}", which
        # must not be interpreted as str.format replacement fields.
        return "Line: %s, method name: %s, regex: %s" % (
            self.order, self.name, self.regex)


class RegexDispatcher:
    """Dispatch strings to methods whose annotation holds a matching regex.

    Args:
        annotation: name of the annotation that marks a dispatchable method
            and carries its regex (default ``"regex"``).
    """

    def __init__(self, annotation="regex"):
        self.annotation = annotation
        # Collect all the methods that carry the annotation named by
        # self.annotation ("regex" by default).
        self.dispatchMethods = [
            RegexMethod(member, self.annotation)
            for _, member in inspect.getmembers(self, predicate=inspect.ismethod)
            if self.annotation in member.__annotations__
        ]
        # Be sure to process the dispatch methods in the order they appear
        # in the class! The order in which regexes are tested matters: the
        # most specific patterns must be tested BEFORE more general ones,
        # otherwise they will never match.
        self.dispatchMethods.sort(key=lambda m: m.order)

    def dispatch(self, s, **kwargs):
        """Call the FIRST dispatch method whose regex matches ``s``.

        The method receives the annotation name as its first positional
        argument, followed by ``**kwargs``.

        Returns:
            The called method's return value, or ``None`` if nothing matched.
        """
        for m in self.dispatchMethods:
            if m.match(s):
                return m(self.annotation, **kwargs)
        return None
要使用这个类,可以通过继承它来创建一个带有注解方法的类。 举个例子,这是一个简单的RPNCalculator,它继承了RegexDispatcher。 被调度的方法当然是那些带有“regex”注解的方法。 父类的dispatch()方法在__call__中被调用。
import math

from RegexDispatcher import RegexDispatcher


class RPNCalculator(RegexDispatcher):
    """Reverse Polish notation calculator built on RegexDispatcher.

    Each whitespace-separated token of an expression is dispatched to the
    first method whose ``regex`` annotation matches it. The stack persists
    between calls; use the ``c`` token to clear it.
    """

    def __init__(self):
        RegexDispatcher.__init__(self)
        self.stack = []

    def __str__(self):
        return str(self.stack)

    # Make RPNCalculator objects callable
    def __call__(self, expression):
        """Evaluate ``expression`` token by token and return the stack top."""
        for t in expression.split():
            self.dispatch(t, token=t)
        return self.top()

    # Stack management
    def top(self):
        """Return the top of the stack, or ``[]`` when the stack is empty."""
        return self.stack[-1] if len(self.stack) > 0 else []

    def push(self, x):
        return self.stack.append(float(x))

    def pop(self, n=1):
        """Pop one value, or a list of ``n`` values (topmost first)."""
        if n == 1:
            return self.stack.pop()
        return [self.stack.pop() for _ in range(n)]

    # Handle numbers
    def number(self, regex: r"[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?", **kwargs):
        self.stack.append(float(kwargs['token']))

    # Binary operators: `a` is the top of the stack, `b` the value below
    # it, so "b a op" evaluates as `b op a`.
    def plus(self, regex: r"\+", **kwargs):
        a, b = self.pop(2)
        self.push(b + a)

    def minus(self, regex: r"\-", **kwargs):
        a, b = self.pop(2)
        self.push(b - a)

    def multiply(self, regex: r"\*", **kwargs):
        a, b = self.pop(2)
        self.push(b * a)

    def divide(self, regex: r"\/", **kwargs):
        a, b = self.pop(2)
        self.push(b / a)

    def pow(self, regex: r"exp", **kwargs):
        a, b = self.pop(2)
        # Same operand order as minus/divide: "2 3 exp" is 2 ** 3.
        self.push(b ** a)

    def logN(self, regex: r"logN", **kwargs):
        a, b = self.pop(2)
        # NOTE(review): computes log of the top value in the base given by
        # the value below it ("base x logN") - confirm this operand order
        # is the intended one.
        self.push(math.log(a, b))

    # Unary operators
    def neg(self, regex: r"neg", **kwargs):
        self.push(-self.pop())

    def sqrt(self, regex: r"sqrt", **kwargs):
        self.push(math.sqrt(self.pop()))

    def log2(self, regex: r"log2", **kwargs):
        self.push(math.log2(self.pop()))

    def log10(self, regex: r"log10", **kwargs):
        self.push(math.log10(self.pop()))

    def pi(self, regex: r"pi", **kwargs):
        self.push(math.pi)

    def e(self, regex: r"e", **kwargs):
        self.push(math.e)

    def deg(self, regex: r"deg", **kwargs):
        self.push(math.degrees(self.pop()))

    def rad(self, regex: r"rad", **kwargs):
        self.push(math.radians(self.pop()))

    # Whole stack operators
    def cls(self, regex: r"c", **kwargs):
        self.stack = []

    def sum(self, regex: r"sum", **kwargs):
        self.stack = [math.fsum(self.stack)]


if __name__ == '__main__':
    calc = RPNCalculator()

    print(calc('2 2 exp 3 + neg'))
    print(calc('c 1 2 3 4 5 sum 2 * 2 / pi'))
    print(calc('pi 2 * deg'))
    print(calc('2 2 logN'))
我喜欢这个解决方案,因为没有单独的查找表。 要匹配的正则表达式以注解的形式嵌入到要调用的方法中。 对我来说,这是理所应当的。 如果Python允许更灵活的注解,那将会很好,因为我宁愿将正则表达式注解放在方法本身上,而不是将其嵌入到方法参数列表中。 但是,目前这是不可能的。
有兴趣的话,可以看看Wolfram语言,其中函数可以在任意模式上多态,而不仅仅是参数类型。 在正则表达式上多态的函数是一个非常强大的想法,但我们无法在Python中干净利落地实现。 RegexDispatcher类是我能做到的最好的了。