Source code for duplicity.globmatch

# -*- Mode:Python; indent-tabs-mode:nil; tab-width:4; encoding:utf-8 -*-
# Copyright 2002 Ben Escoto <>
# Copyright 2007 Kenneth Loafman <>
# Copyright 2014 Aaron Whitehouse <>
# This file is part of duplicity.
# Duplicity is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
# Duplicity is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with duplicity; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# All functions in this module only accept unicode. Any byte strings should
# be converted to unicode before sending them into this.

import re

from builtins import map
from builtins import range
from builtins import str

class GlobbingError(Exception):
    u"""Raised when a glob string cannot be parsed."""
class FilePrefixError(GlobbingError):
    u"""Signals that a specified file lacks the correct prefix."""
def _glob_get_prefix_regexs(glob_str):
    u"""Return a list of regexps, one for each "/"-separated prefix of glob_str.

    For u"/a/b" the prefixes are u"/", u"/a" and u"/a/b"; each is translated
    with glob_to_regex.  Raises GlobbingError if glob_str contains two
    consecutive slashes.
    """
    # Internal.  Used by select_fn_from_glob.
    components = glob_str.split(u"/")

    # An empty component is fine at either end (as in "/foo/"), but one in
    # the middle means the glob contained "//".
    if any(inner == u"" for inner in components[1:-1]):
        raise GlobbingError(u"Consecutive '/'s found in globbing string " +
                            glob_str)

    # Grow each prefix from the previous one instead of re-joining slices.
    prefixes = []
    for component in components:
        if prefixes:
            prefixes.append(prefixes[-1] + u"/" + component)
        else:
            prefixes.append(component)

    # The root "/" is the only directory whose name ends in a slash, so an
    # empty first prefix (glob started with "/") has to become "/".
    if prefixes[0] == u"":
        prefixes[0] = u"/"

    return [glob_to_regex(prefix) for prefix in prefixes]
def select_fn_from_glob(glob_str, include, ignore_case=False):
    u"""Return a function test_fn(path) that applies one shell-style glob.

    glob_str is the (unicode) glob, include is 1 for an include glob and 0
    for an exclude glob, and ignore_case makes the match case-insensitive.
    The returned function reads path.uc_name (and may call path.isdir()) and
    returns:

    0    - if the file should be excluded
    1    - if the file should be included
    2    - if the folder should be scanned for any included/excluded files
    None - if the selection function has nothing to say about the file

    On a match the function returns the value of include itself; 2 is only
    ever returned for include globs.

    The glob is translated to a regular expression and matched against the
    path.  Because a directory that merely *might* contain a matching file
    must still be scanned, the glob is also split into its leading parts and
    any path matching one of those initial sequences gets 2 (scan).

    A glob ending in "/" only matches directories.  Including a folder
    implicitly includes everything within it.

    Thanks to Donovan Baarda, who provided code doing something similar.
    """
    assert isinstance(glob_str, str)

    case_flag = re.IGNORECASE if ignore_case else 0

    def compile_regex(regex_str):
        # re.S lets "." (produced by "**") match newlines in file names too.
        return re.compile(regex_str, re.S | case_flag)

    dir_only = False
    if glob_str == u"/":
        # A bare "/" implicitly includes everything.
        pattern = u"/**"
    elif glob_str[-1] == u"/":
        # Trailing slash: remember it, then match on the name without it.
        dir_only = True
        pattern = glob_str[:-1]
    else:
        pattern = glob_str

    body_regex = glob_to_regex(pattern)

    # ^        anchor at the start of the path
    # body     the glob translated into a regex
    # ($|/)    followed only by end of string (or newline) or a "/"
    # The "/" alternative makes the regex match anything below a matching
    # folder, i.e. including a folder includes its contents.
    match_re = compile_regex(u"^%s($|/)" % body_regex)

    # For directory-only globs we also need a variant that does NOT match
    # folder contents, so an exact hit can be checked for being a folder.
    exact_re = None
    if dir_only:
        exact_re = compile_regex(u"^%s($)" % body_regex)

    # Anything after the first "**" can match at any depth, so only the part
    # up to and including it matters for deciding which folders to scan.
    double_star_at = pattern.find(u"**")
    if double_star_at != -1:
        pattern = pattern[:double_star_at + 2]

    # ^(p1|p2|...)$ where each p is the regex of a parent directory of the
    # (possibly truncated) glob.
    scan_re = compile_regex(
        u"^(%s)$" % u"|".join(_glob_get_prefix_regexs(pattern)))

    def test_fn(path):
        name = path.uc_name
        assert name == u"/" or name[-1] != u"/", \
            u" should never end in '/' during normal operation for " \
            u"normal paths (except '/' alone)\n" \
            u" here is " + path.uc_name + u" and glob is " + pattern

        if match_re.match(name):
            # Path matches the glob or lives inside a matching folder.  The
            # only way this is not a hit is a directory-only glob whose
            # exact match turns out to be a regular file.
            if dir_only and exact_re.match(name) and not path.isdir():
                return None
            return include

        if include == 1 and scan_re.match(name):
            # A parent folder of a possible match: must be scanned.
            return 2

        return None

    return test_fn
def glob_to_regex(pat):
    u"""Return a regular expression string equivalent to the shell glob pat.

    Translation rules:

    **     -> .*     (anything, including "/")
    *      -> [^/]*  (any run of characters other than "/")
    ?      -> [^/]   (exactly one character other than "/")
    [...]  -> a regex character class; a leading "!" or "^" negates it
    other  -> the character itself, escaped with re.escape

    A "[" with no closing "]" is taken literally.  There is no way to quote
    the special characters.  Bracket contents are copied into the regex
    character class, with backslashes and the characters "[", "&", "~" and
    "|" backslash-escaped: Python's re module reserves a leading "[" and the
    sequences "&&", "~~" and "||" inside a class for future set syntax and
    raises FutureWarning on them, and escaping keeps them literal.

    Adapted, with minor modifications, from code by Donovan Baarda.
    """
    # Internal.  Used by select_fn_from_glob and _glob_get_prefix_regexs.
    assert isinstance(pat, str)

    # Collect fragments and join once at the end rather than repeatedly
    # concatenating strings.
    pieces = []
    i, n = 0, len(pat)
    while i < n:
        c, s = pat[i], pat[i:i + 2]
        i += 1
        if s == u'**':
            pieces.append(u'.*')
            i += 1  # consume the second "*" as well
        elif c == u'*':
            pieces.append(u'[^/]*')
        elif c == u'?':
            pieces.append(u'[^/]')
        elif c == u'[':
            # Find the closing "]".  A "]" placed first (optionally after a
            # negation marker) belongs to the set rather than closing it.
            j = i
            if j < n and pat[j] in u'!^':
                j += 1
            if j < n and pat[j] == u']':
                j += 1
            while j < n and pat[j] != u']':
                j += 1
            if j >= n:
                pieces.append(u'\\[')  # unterminated: interpret "[" literally
            else:
                # Deal with the inside of [..]: escape backslashes first so
                # the escapes added below are not doubled.
                stuff = pat[i:j].replace(u'\\', u'\\\\')
                for reserved in u'[&~|':
                    stuff = stuff.replace(reserved, u'\\' + reserved)
                i = j + 1
                if stuff[0] in u'!^':
                    stuff = u'^' + stuff[1:]
                pieces.append(u'[' + stuff + u']')
        else:
            pieces.append(re.escape(c))
    return u''.join(pieces)