#ifndef ALPHANUM__HPP
#define ALPHANUM__HPP

/*
from https://github.com/readium/readium-sdk/tree/master on 5/8/2024

The Alphanum Algorithm is an improved sorting algorithm for strings
containing numbers. Instead of sorting numbers in ASCII order like a
standard sort, this algorithm sorts numbers in numeric order.

The Alphanum Algorithm is discussed at http://www.DaveKoelle.com

This implementation is Copyright (c) 2008 Dirk Jagdmann .
It is a cleanroom implementation of the algorithm and not derived by
other's works. In contrast to the versions written by Dave Koelle this
source code is distributed with the libpng/zlib license.

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

    1. The origin of this software must not be misrepresented; you
       must not claim that you wrote the original software. If you use
       this software in a product, an acknowledgment in the product
       documentation would be appreciated but is not required.

    2. Altered source versions must be plainly marked as such, and
       must not be misrepresented as being the original software.

    3. This notice may not be removed or altered from any source
       distribution. */

/* $Header: /code/doj/alphanum.hpp,v 1.3 2008/01/28 23:06:47 doj Exp $ */

/*
ALTERED SOURCE VERSION (see restriction 2 above). Changes:

  - every function defined in this header is inline and the anonymous
    namespace is gone, so the header can be included from more than one
    translation unit without duplicate-definition link errors;
  - in the default (non-ALPHANUM_LOCALE) build, digit runs are compared
    as digit strings instead of being converted to unsigned long, so
    arbitrarily long runs cannot overflow;
  - a numeric difference is reported as -1/+1 instead of being narrowed
    from long to int, which could flip or lose the sign;
  - non-digit characters are compared as unsigned char, as strcmp() does.
*/

#include <cassert>
#include <cstddef>
#include <functional>
#include <sstream>
#include <string>

#ifdef ALPHANUM_LOCALE
#include <cctype>
#include <cstdlib>
#endif

#ifdef DOJDEBUG
#include <iostream>
#include <typeinfo>
#endif

// TODO: make comparison with hexadecimal numbers. Extend the alphanum_comp() function by traits to choose between decimal and hexadecimal.

namespace doj
{

  namespace detail
  {
    /** Debug helper: logs the two operands of a comparison to std::clog
        when DOJDEBUG is defined and does nothing otherwise. */
    template <class L, class R>
    inline void alphanum_trace(const L& l, const R& r)
    {
#ifdef DOJDEBUG
      std::clog << "alphanum_comp " << l << "," << r << std::endl;
#else
      (void)l;
      (void)r;
#endif
    }
  }

  // If you want to honour the locale settings for detecting digit
  // characters, you should define ALPHANUM_LOCALE.
#ifdef ALPHANUM_LOCALE
  /** Wrapper around std::isdigit(). The argument is converted to
      unsigned char first because passing a negative value (other than
      EOF) to isdigit() is undefined behaviour. */
  inline bool alphanum_isdigit(int c)
  {
    return std::isdigit(static_cast<unsigned char>(c)) != 0;
  }
#else
  /** This function does not consider the current locale and only
      works with ASCII digits.
      @return true if c is a digit character
  */
  inline bool alphanum_isdigit(const char c)
  {
    return c>='0' && c<='9';
  }
#endif

  /**
     Compare l and r with strcmp() semantics, but using the "Alphanum
     Algorithm". Both strings are read once, front to back, and no
     memory is allocated for substrings.

     The strings are processed as alternating non-digit and digit
     sections:
     - a digit sorts before any non-digit character;
     - two non-digit characters are compared by their unsigned char value;
     - two digit runs are compared by numeric value, so leading zeros
       are ignored ("007" equals "7").

     When ALPHANUM_LOCALE is defined, digits are recognised with
     isdigit() and numbers are parsed with strtoul(); otherwise only
     ASCII digits are recognised and digit runs of any length are
     compared without conversion.

     @param l NULL-terminated C-style string
     @param r NULL-terminated C-style string
     @return negative if l<r, 0 if l equals r, positive if l>r
  */
  inline int alphanum_impl(const char *l, const char *r)
  {
    enum { STRING, NUMBER } scan_mode = STRING;

    while(*l && *r)
      {
        if(scan_mode == STRING)
          {
            char l_char, r_char;
            while((l_char=*l) && (r_char=*r))
              {
                // check if these are digit characters
                const bool l_digit=alphanum_isdigit(l_char);
                const bool r_digit=alphanum_isdigit(r_char);
                // if both characters are digits, we continue in NUMBER mode
                if(l_digit && r_digit)
                  {
                    scan_mode=NUMBER;
                    break;
                  }
                // if only the left character is a digit, we have a result
                if(l_digit) return -1;
                // if only the right character is a digit, we have a result
                if(r_digit) return +1;
                // Compare as unsigned char so the ordering of bytes >= 0x80
                // does not depend on whether plain char is signed.
                const int diff=static_cast<unsigned char>(l_char) - static_cast<unsigned char>(r_char);
                // if they differ we have a result
                if(diff != 0) return diff;
                // otherwise process the next characters
                ++l;
                ++r;
              }
          }
        else // scan_mode==NUMBER
          {
#ifdef ALPHANUM_LOCALE
            // NOTE: base 0 lets strtoul() interpret "0x"/"0" prefixes as
            // hexadecimal/octal; this matches the original behaviour.
            char *end;
            const unsigned long l_int=std::strtoul(l, &end, 0);
            l=end;

            const unsigned long r_int=std::strtoul(r, &end, 0);
            r=end;

            // Compare instead of subtracting: the difference of two
            // unsigned longs does not fit the int return type.
            if(l_int != r_int)
              return l_int < r_int ? -1 : +1;
#else
            // Leading zeros do not change the numeric value.
            while(*l == '0') ++l;
            while(*r == '0') ++r;

            // Find the end of both digit runs.
            const char* const l_digits=l;
            const char* const r_digits=r;
            while(alphanum_isdigit(*l)) ++l;
            while(alphanum_isdigit(*r)) ++r;

            // Without leading zeros, the longer run is the larger number.
            const std::size_t l_len=static_cast<std::size_t>(l - l_digits);
            const std::size_t r_len=static_cast<std::size_t>(r - r_digits);
            if(l_len != r_len)
              return l_len < r_len ? -1 : +1;

            // Same length: the first differing digit decides.
            for(std::size_t i=0; i<l_len; ++i)
              {
                if(l_digits[i] != r_digits[i])
                  return l_digits[i] < r_digits[i] ? -1 : +1;
              }
#endif

            // both numbers are equal, process the next substring in STRING mode
            scan_mode=STRING;
          }
      }

    // At least one string is exhausted; the one with characters left is greater.
    if(*r) return -1;
    if(*l) return +1;
    return 0;
  }

  /**
     Compare left and right with the same semantics as strcmp(), but with the
     "Alphanum Algorithm" which produces more human-friendly
     results. The classes lT and rT must implement "std::ostream
     operator<< (std::ostream&, const Ty&)"; both operands are streamed
     into a string before they are compared.

     @return negative if left<right, 0 if left equals right, positive if left>right.
  */
  template <typename lT, typename rT>
  int alphanum_comp(const lT& left, const rT& right)
  {
    std::ostringstream l; l << left;
    std::ostringstream r; r << right;
    return alphanum_impl(l.str().c_str(), r.str().c_str());
  }

  /**
     Specialisation for two std::string operands: compares the character
     data directly instead of streaming it.

     @return negative if l<r, 0 if l equals r, positive if l>r.
  */
  template <>
  inline int alphanum_comp<std::string>(const std::string& l, const std::string& r)
  {
    detail::alphanum_trace(l, r);
    return alphanum_impl(l.c_str(), r.c_str());
  }

  ////////////////////////////////////////////////////////////////////////////

  // Now follow overloaded alphanum_comp() functions to get a direct
  // call to alphanum_impl() for the various combinations of C and C++
  // strings. Each returns negative if l<r, 0 if l equals r, positive
  // if l>r. C-string arguments must not be null (checked with assert()).

  inline int alphanum_comp(char* l, char* r)
  {
    assert(l);
    assert(r);
    detail::alphanum_trace(l, r);
    return alphanum_impl(l, r);
  }

  inline int alphanum_comp(const char* l, const char* r)
  {
    assert(l);
    assert(r);
    detail::alphanum_trace(l, r);
    return alphanum_impl(l, r);
  }

  inline int alphanum_comp(char* l, const char* r)
  {
    assert(l);
    assert(r);
    detail::alphanum_trace(l, r);
    return alphanum_impl(l, r);
  }

  inline int alphanum_comp(const char* l, char* r)
  {
    assert(l);
    assert(r);
    detail::alphanum_trace(l, r);
    return alphanum_impl(l, r);
  }

  inline int alphanum_comp(const std::string& l, char* r)
  {
    assert(r);
    detail::alphanum_trace(l, r);
    return alphanum_impl(l.c_str(), r);
  }

  inline int alphanum_comp(char* l, const std::string& r)
  {
    assert(l);
    detail::alphanum_trace(l, r);
    return alphanum_impl(l, r.c_str());
  }

  inline int alphanum_comp(const std::string& l, const char* r)
  {
    assert(r);
    detail::alphanum_trace(l, r);
    return alphanum_impl(l.c_str(), r);
  }

  inline int alphanum_comp(const char* l, const std::string& r)
  {
    assert(l);
    detail::alphanum_trace(l, r);
    return alphanum_impl(l, r.c_str());
  }

  ////////////////////////////////////////////////////////////////////////////

  /** Minimal base class that provides the argument and result typedefs
      of a binary function object. */
  template <typename Arg1, typename Arg2, typename Result>
  struct binary_function
  {
    using first_argument_type = Arg1;
    using second_argument_type = Arg2;
    using result_type = Result;
  };

  /**
     Functor class to compare two objects with the "Alphanum
     Algorithm". If the objects are no std::string, they must
     implement "std::ostream operator<< (std::ostream&, const Ty&)".
     operator() returns true when left orders strictly before right.
  */
  template <class Ty>
  struct alphanum_less : public binary_function<Ty, Ty, bool>
  {
    bool operator()(const Ty& left, const Ty& right) const
    {
      return alphanum_comp(left, right) < 0;
    }
  };

}


#endif
(self._pydantic_iterator): if (isinstance(file[0], dict)): map_with_path = file[0] @@ -299,7 +304,7 @@ def __iter__(self) -> Union[List[Tuple[List[Tuple[str, Union[str, int, float]]], if self._length() == 0: break - if (self.pydantic_iterator): + if (self._pydantic_iterator): if (isinstance(block[0], dict)): map_with_path = block[0] diff --git a/tests/test_filepattern.py b/tests/test_filepattern.py index 372cef52..3cce38b5 100644 --- a/tests/test_filepattern.py +++ b/tests/test_filepattern.py @@ -20,11 +20,14 @@ class TestFilePattern(): test_generate_filepattern_data.generate_data() test_generate_filepattern_data.generate_channel_data() + test_generate_filepattern_data.generate_sorted_data() root_directory = os.path.dirname(os.path.realpath(__file__)) path = root_directory + '/test_data/data100' + sorted_path = root_directory + '/test_data/sorted_data' + old_pattern = 'img_r{rrr}_c{ccc}.tif' patterns = ['img_r00{r:d}_c{c:ddd}.tif', 'img_r{r:d+}_c{c:d+}.tif', old_pattern] @@ -344,6 +347,39 @@ def test_recursive_multi_directory_regex_fp(self): assert fp_data.test_recursive_directory_fp[i][0]["directory"] == result[i][0]["directory"] assert str(os.path.basename(fp_data.test_recursive_directory_fp[i][1][0])) == os.path.basename(result[i][1][0]) + def test_file_pattern_iter(self): + + for pattern in self.patterns: + + files = fp.FilePattern(self.path, pattern) + + result = [] + + for file in files: # test iterator without call + result.append(file) + + assert (len(fp_data.test_fp) == len(result)) + + assert (len(result) == len(files)) # test length operator + + for i in range(len(result)): + assert fp_data.test_fp[i][0]["r"] == result[i][0]["r"] + assert fp_data.test_fp[i][0]["c"] == result[i][0]["c"] + assert os.path.basename(fp_data.test_fp[i][1][0]) == os.path.basename(result[i][1][0]) + + # test that numeric only, double digit numbers are sorted properly + def test_file_pattern_sorting(self): + + sorted_pattern = '{index:d+}.tif' + files = 
fp.FilePattern(self.sorted_path, sorted_pattern) + + indices = [] + for index, file in files(): + indices.append(index['index']) + + assert sorted(indices) == indices + + # Todo: These tests need new data to be added after replacing the old version of filepattern. """ def test_group_by_multi(self): diff --git a/tests/test_generate_filepattern_data.py b/tests/test_generate_filepattern_data.py index 8b5385fa..b18077d8 100644 --- a/tests/test_generate_filepattern_data.py +++ b/tests/test_generate_filepattern_data.py @@ -100,10 +100,40 @@ def generate_channel_data(): print("Files generated.") +def generate_sorted_data(): + MAX = 30 + length = 0 + + directory = 'test_data' + root_directory = os.path.dirname(os.path.realpath(__file__)) + path = os.path.join(root_directory, directory) + data_path = path + '/sorted_data' + + try: + os.mkdir(path) + print('Data directory created at ' + path) + except FileExistsError: + print("Data directory already exists") + + try: + os.mkdir(data_path) + print('Data directory created at ' + data_path) + except FileExistsError: + print("Data directory already exists") + + for i in range(0, MAX): + + data_name = '{}.tif'.format(str(i)) + f = open(data_path + '/' + data_name, 'w+') + f.close() + + print(str(length) + " files generated.") + if __name__ == '__main__': generate_data() generate_channel_data() + generate_sorted_data() MAX = 3 @@ -113,6 +143,7 @@ def generate_channel_data(): path = os.path.join(root_directory, directory) data_path = path + '/data' recursive_path = path + '/recursive_data' +sorted_data = path + '/sorted' sp_data = path + "/sp_data.txt" @@ -120,6 +151,7 @@ def generate_channel_data(): os.mkdir(path) os.mkdir(data_path) os.mkdir(recursive_path) + os.mkdir(sorted_data) print('Data directory created at ' + path) except FileExistsError: print("Data directory already exists")