diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..610b30d
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,27 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Use Test Resources",
+            "type": "python",
+            "request": "launch",
+            "program": "${workspaceFolder}/csvbyname/csvbyname.py",
+            "console": "integratedTerminal",
+            "args": [
+                "${workspaceFolder}/tests/resources",
+                "${workspaceFolder}/output.csv",
+                "-r",
+                "-p",
+                "group_num:group(\\d)-\\w-\\d+\\.txt",
+                // NOTE(review): named-group identifiers below are placeholders; any valid names work.
+                "group(\\d)-(?P<letter>\\w)-(?P<number>\\d+)\\.txt",
+                "-V",
+                "DEBUG"
+            ],
+            "justMyCode": true
+        }
+    ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 786c995..de288e1 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,6 +1,3 @@
 {
-    "python.formatting.provider": "none",
-    "[python]": {
-        "editor.defaultFormatter": "ms-python.black-formatter"
-    }
+    "python.formatting.provider": "black"
 }
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000..78a9d97
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,36 @@
+pipeline {
+    agent any
+    stages {
+        stage("clean") {
+            steps {
+                sh 'rm -rf ./dist/*'
+            }
+        }
+        stage("install") {
+            steps {
+                sh 'mamba env update --file environment.yml'
+                sh 'echo "mamba activate csvbyname" >> ~/.bashrc'
+            }
+        }
+        stage("build") {
+            steps {
+                sh "python -m build"
+            }
+        }
+        stage("test") {
+            steps {
+                sh "pip install dist/*.whl"
+            }
+        }
+        stage("publish") {
+            when {
+                branch '**/master'
+            }
+            steps {
+                withCredentials([usernamePassword(credentialsId: 'rs-git-package-registry-ydeng', passwordVariable: 'PASS', usernameVariable: 'USER')]) {
+                    sh 'python -m twine upload --repository-url "https://git.reslate.systems/api/packages/${USER}/pypi" -u "${USER}" -p "${PASS}" --non-interactive --disable-progress-bar --verbose dist/*'
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/csvbyname/__init__.py b/csvbyname/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/csvbyname/csvbyname.py b/csvbyname/csvbyname.py
index faf1863..4aa6163 100644
--- a/csvbyname/csvbyname.py
+++ b/csvbyname/csvbyname.py
@@ -1,12 +1,65 @@
 import argparse
+import csv
+import logging
 import os
 import re
+from typing import Iterable
+
+try:
+    from . import exceptions
+except ImportError:  # executed directly as a script, outside the package
+    import exceptions
+
+logger = logging.getLogger(__name__)
 
 
-def matcher(path: str, regex_groups: list[str]):
-    matches = []
-    for regex in regex_groups:
-        matches[path]
+def matcher(full_path: str, use_full_path: bool, regex_groups: list[str]):
+    """Extract CSV properties for one path from the given regex specifications.
+
+    Each entry of ``regex_groups`` is first tried as a plain regex; if it
+    matches and defines named groups, every named group becomes a property.
+    Otherwise the entry is read as ``property-name:regex`` and the first
+    capture group of ``regex`` supplies the value of ``property-name``.
+
+    Matching uses the whole path when ``use_full_path`` is true and only the
+    base name otherwise.
+
+    Raises:
+        exceptions.InvalidPropertiesException: if two specifications yield
+            the same property name.
+    """
+    target = full_path if use_full_path else os.path.basename(full_path)
+    captured_properties = {}
+    for regex_and_group in regex_groups:
+        match_assume_named = re.match(regex_and_group, target)
+        if match_assume_named and len(match_assume_named.groupdict()) > 0:
+            for group, val in match_assume_named.groupdict().items():
+                if group not in captured_properties:
+                    captured_properties[group] = val
+                else:
+                    raise exceptions.InvalidPropertiesException(
+                        f'Duplicate capture group names found: "{group}"'
+                    )
+        else:
+            unnamed_split = regex_and_group.split(":", 1)
+            if len(unnamed_split) < 2:
+                logger.debug(
+                    'File at "%s" could not be matched by regex "%s" '
+                    "and will be skipped",
+                    full_path,
+                    regex_and_group,
+                )
+                continue
+            group, regex = unnamed_split
+            unnamed_match = re.match(regex, target)
+            if unnamed_match:
+                if group not in captured_properties:
+                    captured_properties[group] = unnamed_match.group(1)
+                else:
+                    raise exceptions.InvalidPropertiesException(
+                        f'Duplicate capture group names found: "{group}"'
+                    )
+    return captured_properties
 
 
 def collect_files(
@@ -17,44 +70,62 @@ def collect_files(
     regex_groups: list[str],
 ):
     collected = {}
-
-    def matcher(full_path, use_full_path):
-        return [
-            re.match(
-                regex, full_path if use_full_path else os.path.basename(full_path)
-            ).groups(1)
-            for regex in regex_groups
-        ]
-
+    pkeys = set()
     for item in os.listdir(dir_path):
         full_path = os.path.join(dir_path, item)
         if os.path.isdir(full_path):
             if include_folders:
-                if full_path not in collected:
-                    collected[full_path] = set()
-                collected = collected[full_path] | matcher(full_path, entire_path)
-            collected = collected | collect_files(
-                full_path, include_folders, entire_path, recursive, regex_groups
-            )
+                collected[full_path] = matcher(full_path, entire_path, regex_groups)
+                pkeys.update(collected[full_path])
+            if recursive:
+                # collect_files returns a (mapping, keys) pair; merge each part.
+                sub_collected, sub_pkeys = collect_files(
+                    full_path, include_folders, entire_path, recursive, regex_groups
+                )
+                collected.update(sub_collected)
+                pkeys.update(sub_pkeys)
         elif os.path.isfile(full_path):
-            if full_path not in collected:
-                collected[full_path] = set()
-            collected = collected[full_path] | matcher(full_path, entire_path)
+            collected[full_path] = matcher(full_path, entire_path, regex_groups)
+            pkeys.update(collected[full_path])
+    return collected, pkeys
 
 
-def write_collected_to_csv(output_path: str, collected: dict[str, dict[str, str]]):
-    # TODO Finish writing collected files/paths to CSV.
-    pass
+def write_collected_to_csv(
+    output_path: str, collected: dict[str, dict[str, str]], property_keys: Iterable[str]
+):
+    """Write one CSV row per collected path, with a column per property key.
+
+    Columns are ``path`` followed by the sorted ``property_keys``; a property
+    missing for a path is written as ``N/A``.
+    """
+    # newline="" lets the csv module control line endings itself.
+    with open(output_path, "w", newline="") as output_fd:
+        s_property_keys = sorted(property_keys)
+        header = ["path", *s_property_keys]
+        writer = csv.writer(output_fd)
+        writer.writerow(header)
+        for full_path, properties in collected.items():
+            writer.writerow(
+                [
+                    full_path,
+                    *(
+                        properties[k] if k in properties else "N/A"
+                        for k in s_property_keys
+                    ),
+                ]
+            )
 
 
 def run(args):
-    collect_files(
+    logger.info('Collecting files from "%s"', args.directory)
+    collected, pkeys = collect_files(
         args.directory,
         args.include_folders,
         args.entire_path,
         args.recursive,
-        args.add_regex_property,
+        args.add_re_property or [],  # -p is optional, so this may be None
     )
+    write_collected_to_csv(args.output, collected, pkeys)
 
 
 def main():
@@ -74,7 +145,7 @@ def main():
     )
     argparser.add_argument(
         "-l",
-        "--include-folder",
+        "--include-folders",
         help="Include folders in the cataloguing process",
         action="store_true",
         required=False,
@@ -98,11 +169,26 @@ def main():
     )
     argparser.add_argument(
         "-p",
-        "--add-regex-property",
+        "--add-re-property",
         help="Add a property in the resulting CSV obtained from the first capture "
-        "group of the given REGEX in the following format:\n property-name:regex",
+        "group of the given REGEX in the following format:\n property-name:regex.\n"
+        "Alternatively, use named REGEX groups.",
         nargs="+",
+        type=str,
+    )
+    argparser.add_argument(
+        "-V",
+        "--verbosity",
+        help="Set the verbosity of the logging",
+        type=str,
+        required=False,
+        default="INFO",
     )
 
     args = argparser.parse_args()
+    logging.basicConfig(level=args.verbosity.upper())
     run(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/csvbyname/exceptions.py b/csvbyname/exceptions.py
new file mode 100644
index 0000000..65c3fc7
--- /dev/null
+++ b/csvbyname/exceptions.py
@@ -0,0 +1,2 @@
+class InvalidPropertiesException(Exception):
+    """Raised when two regex specifications capture the same property name."""
\ No newline at end of file
diff --git a/environment.yaml b/environment.yaml
deleted file mode 100644
index 2a6a99a..0000000
--- a/environment.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-name: /home/ydeng/csvbyname/env
-channels:
-  - conda-forge
-dependencies:
-  - _libgcc_mutex=0.1=conda_forge
-  - _openmp_mutex=4.5=2_gnu
-  - black=23.3.0=py311h38be061_0
-  - bzip2=1.0.8=h7f98852_4
-  - ca-certificates=2022.12.7=ha878542_0
-  - click=8.1.3=unix_pyhd8ed1ab_2
-  - ld_impl_linux-64=2.40=h41732ed_0
-  - libexpat=2.5.0=hcb278e6_1
-  - libffi=3.4.2=h7f98852_5
-  - libgcc-ng=12.2.0=h65d4601_19
-  - libgomp=12.2.0=h65d4601_19
-  - libnsl=2.0.0=h7f98852_0
-  - libsqlite=3.40.0=h753d276_0
-  - libuuid=2.38.1=h0b41bf4_0
-  - libzlib=1.2.13=h166bdaf_4
-  - mypy_extensions=1.0.0=pyha770c72_0
-  - ncurses=6.3=h27087fc_1
-  - openssl=3.1.0=h0b41bf4_0
-  - packaging=23.1=pyhd8ed1ab_0
-  - pathspec=0.11.1=pyhd8ed1ab_0
-  - pip=23.1=pyhd8ed1ab_0
-  - platformdirs=3.2.0=pyhd8ed1ab_0
-  - python=3.11.3=h2755cc3_0_cpython
-  - python_abi=3.11=3_cp311
-  - readline=8.2=h8228510_1
-  - setuptools=67.6.1=pyhd8ed1ab_0
-  - tk=8.6.12=h27826a3_0
-  - typing-extensions=4.5.0=hd8ed1ab_0
-  - typing_extensions=4.5.0=pyha770c72_0
-  - tzdata=2023c=h71feb2d_0
-  - wheel=0.40.0=pyhd8ed1ab_0
-  - xz=5.2.6=h166bdaf_0
-prefix: /home/ydeng/csvbyname/env
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..ad5d058
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,8 @@
+name: csvbyname
+channels:
+  - conda-forge
+dependencies:
+  - build=0.7.0
+  - pytest=7.2.2
+  - twine=4.0.2
+  - python=3.9
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..5f8f5e3
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = ["setuptools", "wheel"]
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..b3f6052
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,10 @@
+[metadata]
+name = csvbyname
+version = 0.0.1
+
+[options]
+packages = csvbyname
+
+[options.entry_points]
+console_scripts =
+    csvbyname = csvbyname.csvbyname:main
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6068493
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,3 @@
+from setuptools import setup
+
+setup()