diff --git a/.gitea/workflows/main.yml b/.gitea/workflows/main.yml
index bffb70d..338c29c 100644
--- a/.gitea/workflows/main.yml
+++ b/.gitea/workflows/main.yml
@@ -2,6 +2,10 @@
 
 name: CI
 
+# Lets make targets run on CI without first running 'make setup' (see the Makefile 'issetup' target).
+env:
+  SKIP_MAKE_SETUP_CHECK: 'true'
+
 # Controls when the workflow will run
 on:
   # Triggers the workflow on push or pull request events but only for the main branch
diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml
index 89c423e..5d34921 100644
--- a/.gitea/workflows/release.yml
+++ b/.gitea/workflows/release.yml
@@ -2,6 +2,10 @@ name: Upload Python Package
 permissions:
   contents: write
 
+# Lets make targets run on CI without first running 'make setup' (see the Makefile 'issetup' target).
+env:
+  SKIP_MAKE_SETUP_CHECK: 'true'
+
 on:
   push:
     # Sequence of patterns matched against refs/tags
diff --git a/Makefile b/Makefile
index cca439a..a6a6981 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,8 @@
 
 .PHONY: issetup
 issetup:
-	@[ -f .git/hooks/commit-msg ] || (echo "You must run 'make setup' first to initialize the repo!" && exit 1)
+# The hook check is skipped when SKIP_MAKE_SETUP_CHECK is set (to any value); uses the POSIX ${VAR+x} test so it also works when make runs /bin/sh.
+	@[ -f .git/hooks/commit-msg ] || [ -n "$${SKIP_MAKE_SETUP_CHECK+x}" ] || (echo "You must run 'make setup' first to initialize the repo!" && exit 1)
 
 .PHONY: setup
 setup:
diff --git a/TextParser.Tests/TextParserTests.cs b/TextParser.Tests/TextParserTests.cs
new file mode 100644
index 0000000..16b125b
--- /dev/null
+++ b/TextParser.Tests/TextParserTests.cs
@@ -0,0 +1,97 @@
+namespace TextParser.Tests;
+
+using Parsing;
+using Parsing.Schema;
+using Parsing.Schema.BuildingBlocks;
+using Parsing.Tokenization;
+
+/// <summary>Exercises TextParser.ParseLine against flat, repeated and nested-repetition schemas.</summary>
+public class TextParserTests
+{
+    private const string testInput1 = "2 4 6 8";
+    private const string testInput2 = "2 ab ba 8 cd dc";
+
+    /// <summary>A Repeat(4) of one integer parses "2 4 6 8" into four integer tokens.</summary>
+    [Fact]
+    public void TestSimpleRepetition()
+    {
+        var schemaBuilder = new InputSchemaBuilder();
+        var schema = schemaBuilder
+            .Repeat(4)
+            .Expect(InputType.Integer)
+            .EndRepetition()
+            .Build();
+
+        var parser = new TextParser(schema);
+        var tokens = parser.ParseLine(testInput1);
+
+        Assert.Equal(4, tokens.Count);
+        Assert.Equal(InputType.Integer, tokens[0].GetInputType());
+        Assert.Equal(InputType.Integer, tokens[1].GetInputType());
+        Assert.Equal(InputType.Integer, tokens[2].GetInputType());
+        Assert.Equal(InputType.Integer, tokens[3].GetInputType());
+        Assert.Equal(2, (tokens[0] as IntegerToken)?.GetValue());
+        Assert.Equal(4, (tokens[1] as IntegerToken)?.GetValue());
+        Assert.Equal(6, (tokens[2] as IntegerToken)?.GetValue());
+        Assert.Equal(8, (tokens[3] as IntegerToken)?.GetValue());
+    }
+
+    /// <summary>Four explicit integer expectations parse the same input into the same four tokens.</summary>
+    [Fact]
+    public void TestSimpleInput()
+    {
+        var schemaBuilder = new InputSchemaBuilder();
+        var schema = schemaBuilder
+            .Expect(InputType.Integer)
+            .Expect(InputType.Integer)
+            .Expect(InputType.Integer)
+            .Expect(InputType.Integer)
+            .Build();
+
+        var parser = new TextParser(schema);
+        var tokens = parser.ParseLine(testInput1);
+
+        Assert.Equal(4, tokens.Count);
+        Assert.Equal(InputType.Integer, tokens[0].GetInputType());
+        Assert.Equal(InputType.Integer, tokens[1].GetInputType());
+        Assert.Equal(InputType.Integer, tokens[2].GetInputType());
+        Assert.Equal(InputType.Integer, tokens[3].GetInputType());
+        Assert.Equal(2, (tokens[0] as IntegerToken)?.GetValue());
+        Assert.Equal(4, (tokens[1] as IntegerToken)?.GetValue());
+        Assert.Equal(6, (tokens[2] as IntegerToken)?.GetValue());
+        Assert.Equal(8, (tokens[3] as IntegerToken)?.GetValue());
+    }
+
+
+    /// <summary>A repetition nested inside a repetition yields integer, string, string - twice over.</summary>
+    [Fact]
+    public void TestNestedRepetition()
+    {
+        var schemaBuilder = new InputSchemaBuilder();
+        var schema = schemaBuilder
+            .Repeat(2)
+            .Expect(InputType.Integer)
+            .Repeat(2)
+            .Expect(InputType.String)
+            .EndRepetition()
+            .EndRepetition()
+            .Build();
+
+        var parser = new TextParser(schema);
+        var tokens = parser.ParseLine(testInput2);
+
+        Assert.Equal(6, tokens.Count);
+        Assert.Equal(InputType.Integer, tokens[0].GetInputType());
+        Assert.Equal(InputType.String, tokens[1].GetInputType());
+        Assert.Equal(InputType.String, tokens[2].GetInputType());
+        Assert.Equal(InputType.Integer, tokens[3].GetInputType());
+        Assert.Equal(InputType.String, tokens[4].GetInputType());
+        Assert.Equal(InputType.String, tokens[5].GetInputType());
+        Assert.Equal(2, (tokens[0] as IntegerToken)?.GetValue());
+        Assert.Equal("ab", (tokens[1] as StringToken)?.GetValue());
+        Assert.Equal("ba", (tokens[2] as StringToken)?.GetValue());
+        Assert.Equal(8, (tokens[3] as IntegerToken)?.GetValue());
+        Assert.Equal("cd", (tokens[4] as StringToken)?.GetValue());
+        Assert.Equal("dc", (tokens[5] as StringToken)?.GetValue());
+    }
+}
diff --git a/TextParser.Tests/UnitTest1.cs b/TextParser.Tests/UnitTest1.cs
deleted file mode 100644
index 643b108..0000000
--- a/TextParser.Tests/UnitTest1.cs
+++ /dev/null
@@ -1,14 +0,0 @@
-namespace TextParser.Tests;
-
-using TextParser;
-
-public class UnitTest1
-{
-    [Fact]
-    public void Test1()
-    {
-        var hwp = new HelloWorldProvider();
-        Assert.Equal("Hello, Simon!", hwp.GetHelloWorld("Simon"));
-        Assert.Equal("Hello world!", hwp.GetHelloWorld());
-    }
-}
diff --git a/TextParser/HelloWorldProvider.cs b/TextParser/HelloWorldProvider.cs
deleted file mode 100644
index a728aef..0000000
--- a/TextParser/HelloWorldProvider.cs
+++ /dev/null
@@ -1,16 +0,0 @@
-namespace TextParser;
-
-public class HelloWorldProvider
-{
-    public string GetHelloWorld(string? name = null)
-    {
-        if (string.IsNullOrEmpty(name))
-        {
-            return "Hello world!";
-        }
-        else
-        {
-            return $"Hello, {name}!";
-        }
-    }
-}
diff --git a/TextParser/Schema/BlockType.cs b/TextParser/Schema/BlockType.cs
new file mode 100644
index 0000000..c67ba4a
--- /dev/null
+++ b/TextParser/Schema/BlockType.cs
@@ -0,0 +1,7 @@
+namespace Parsing.Schema;
+
+/// <summary>Every kind of schema building block; InputType and RepetitionType alias subsets of these values.</summary>
+public enum BlockType
+{
+    Integer, String, FixedRepetition, GreedyRepetition
+}
diff --git a/TextParser/Schema/BuildingBlocks/BuildingBlockBase.cs b/TextParser/Schema/BuildingBlocks/BuildingBlockBase.cs
new file mode 100644
index 0000000..f046678
--- /dev/null
+++ b/TextParser/Schema/BuildingBlocks/BuildingBlockBase.cs
@@ -0,0 +1,29 @@
+namespace Parsing.Schema.BuildingBlocks;
+
+using Parsing.Tokenization;
+
+/// <summary>Base for building blocks; supplies the non-repetition defaults for IsRepetitionType and CheckIsDoneParsingAndReset.</summary>
+abstract class BuildingBlockBase : IBuildingBlock
+{
+    public BuildingBlockBase()
+    {
+    }
+
+    public abstract IToken ParseWord(InputProvider inputs);
+
+    public abstract bool CanParseWord(InputProvider inputs);
+
+    public abstract BlockType GetBlockType();
+
+    public virtual bool IsRepetitionType()
+    {
+        return false;
+    }
+
+    public virtual bool CheckIsDoneParsingAndReset()
+    {
+        // most blocks are always done parsing after consuming a token
+        // repetition blocks can consume multiple tokens
+        return true;
+    }
+}
\ No newline at end of file
diff --git a/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs b/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs
new file mode 100644
index 0000000..1a057f2
--- /dev/null
+++ b/TextParser/Schema/BuildingBlocks/FixedRepetitionBlock.cs
@@ -0,0 +1,73 @@
+namespace Parsing.Schema.BuildingBlocks;
+
+using System.IO.Pipelines;
+using Parsing.Tokenization;
+
+/// <summary>Runs a nested schema a fixed number of times, then resets its counter and context so the block can be run again.</summary>
+class FixedRepetitionBlock : BuildingBlockBase
+{
+    private InputSchema inputSchema;
+    private InputSchemaContext context;
+
+    private int repetitionCount;
+    private int initRepetitionCount;
+
+    public FixedRepetitionBlock(InputSchema inputSchema, int repetitionCount)
+    {
+        this.inputSchema = inputSchema;
+        this.repetitionCount = repetitionCount;
+        this.initRepetitionCount = repetitionCount;
+        this.context = this.inputSchema.CreateContext();
+    }
+
+    public override IToken ParseWord(InputProvider inputs)
+    {
+        var result = inputSchema.ProcessNextWord(context, inputs);
+        if (context.HasFinished)
+        {
+            this.repetitionCount--;
+            if (this.repetitionCount > 0)
+            {
+                this.context = this.inputSchema.CreateContext();
+            }
+        }
+        return result;
+    }
+
+    public override bool CanParseWord(InputProvider inputs)
+    {
+        bool result;
+        if (this.repetitionCount == 0)
+        {
+            result = false;
+        }
+        else
+        {
+            result = inputSchema.CanProcessNextWord(context, inputs);
+        }
+
+        return result;
+    }
+
+    public override BlockType GetBlockType()
+    {
+        return BlockType.FixedRepetition;
+    }
+
+    public override bool IsRepetitionType()
+    {
+        return true;
+    }
+
+    public override bool CheckIsDoneParsingAndReset()
+    {
+        // we are done parsing once all repetitions are exhausted
+        var result = this.repetitionCount == 0;
+        if (result)
+        {
+            this.repetitionCount = this.initRepetitionCount;
+            this.context = this.inputSchema.CreateContext();
+        }
+        return result;
+    }
+}
\ No newline at end of file
diff --git a/TextParser/Schema/BuildingBlocks/IBuildingBlock.cs b/TextParser/Schema/BuildingBlocks/IBuildingBlock.cs
new file mode 100644
index 0000000..d6b3339
--- /dev/null
+++ b/TextParser/Schema/BuildingBlocks/IBuildingBlock.cs
@@ -0,0 +1,17 @@
+namespace Parsing.Schema.BuildingBlocks;
+
+using Parsing.Tokenization;
+
+/// <summary>One element of an InputSchema; turns the next input word into a token.</summary>
+public interface IBuildingBlock
+{
+    public IToken ParseWord(InputProvider inputs);
+
+    public bool CanParseWord(InputProvider inputs);
+
+    public BlockType GetBlockType();
+
+    public bool IsRepetitionType();
+
+    public bool CheckIsDoneParsingAndReset();
+}
\ No newline at end of file
diff --git a/TextParser/Schema/BuildingBlocks/IntegerBlock.cs b/TextParser/Schema/BuildingBlocks/IntegerBlock.cs
new file mode 100644
index 0000000..ebaf4c0
--- /dev/null
+++ b/TextParser/Schema/BuildingBlocks/IntegerBlock.cs
@@ -0,0 +1,32 @@
+namespace Parsing.Schema.BuildingBlocks;
+
+using Parsing.Tokenization;
+
+/// <summary>Consumes one word as an IntegerToken; CanParseWord peeks inside a lookahead context so no input is consumed.</summary>
+class IntegerBlock : BuildingBlockBase
+{
+
+    public IntegerBlock()
+    {
+    }
+
+    public override IToken ParseWord(InputProvider inputs)
+    {
+        return new IntegerToken(inputs.YieldWord());
+    }
+
+    public override bool CanParseWord(InputProvider inputs)
+    {
+        using (inputs.GetLookaheadContext())
+        {
+            int number = 0;
+            var success = int.TryParse(inputs.YieldWord(), out number);
+            return success;
+        }
+    }
+
+    public override BlockType GetBlockType()
+    {
+        return BlockType.Integer;
+    }
+}
\ No newline at end of file
diff --git a/TextParser/Schema/BuildingBlocks/StringBlock.cs b/TextParser/Schema/BuildingBlocks/StringBlock.cs
new file mode 100644
index 0000000..dea4ae7
--- /dev/null
+++ b/TextParser/Schema/BuildingBlocks/StringBlock.cs
@@ -0,0 +1,27 @@
+namespace Parsing.Schema.BuildingBlocks;
+
+using Parsing.Tokenization;
+
+/// <summary>Consumes one word as a StringToken; CanParseWord always answers true.</summary>
+class StringBlock : BuildingBlockBase
+{
+
+    public StringBlock()
+    {
+    }
+
+    public override IToken ParseWord(InputProvider inputs)
+    {
+        return new StringToken(inputs.YieldWord());
+    }
+
+    public override bool CanParseWord(InputProvider inputs)
+    {
+        return true;
+    }
+
+    public override BlockType GetBlockType()
+    {
+        return BlockType.String;
+    }
+}
\ No newline at end of file
diff --git a/TextParser/Schema/InputSchema.cs b/TextParser/Schema/InputSchema.cs
new file mode 100644
index 0000000..45cab2e
--- /dev/null
+++ b/TextParser/Schema/InputSchema.cs
@@ -0,0 +1,74 @@
+namespace Parsing.Schema;
+
+using Parsing.Schema;
+using Parsing.Schema.BuildingBlocks;
+using Parsing.Tokenization;
+using System.Collections;
+
+/// <summary>Progress through an InputSchema: the index of the block to run next and whether all blocks are done.</summary>
+public class InputSchemaContext
+{
+    public int lastProcessedBlockIndex { get; set; } = 0;
+    public bool HasFinished { get; set; } = false;
+}
+
+/// <summary>Ordered list of building blocks that converts a list of words into tokens.</summary>
+public class InputSchema
+{
+    private List<IBuildingBlock> buildingBlocks;
+
+    public InputSchema()
+    {
+        buildingBlocks = new List<IBuildingBlock>();
+    }
+
+    public void AddBuildingBlock(IBuildingBlock buildingBlock)
+    {
+        this.buildingBlocks.Add(buildingBlock);
+    }
+
+    public IToken ProcessNextWord(InputSchemaContext currentContext, InputProvider inputs)
+    {
+        var nextBlock = this.buildingBlocks[currentContext.lastProcessedBlockIndex];
+        var token = nextBlock.ParseWord(inputs);
+        if (!nextBlock.IsRepetitionType() || nextBlock.CheckIsDoneParsingAndReset())
+        {
+            currentContext.lastProcessedBlockIndex++;
+            currentContext.HasFinished = currentContext.lastProcessedBlockIndex >= this.buildingBlocks.Count;
+        }
+        return token;
+    }
+
+    public bool CanProcessNextWord(InputSchemaContext currentContext, InputProvider inputs)
+    {
+        using (inputs.GetLookaheadContext())
+        {
+            // the index check also covers a schema with no blocks, where HasFinished is still false
+            if (currentContext.HasFinished || currentContext.lastProcessedBlockIndex >= this.buildingBlocks.Count)
+            {
+                return false;
+            }
+            var nextBlock = this.buildingBlocks[currentContext.lastProcessedBlockIndex];
+            return nextBlock.CanParseWord(inputs);
+        }
+    }
+
+    public IList<IToken> ProcessWordList(string[] words)
+    {
+        List<IToken> tokens = new List<IToken>();
+        InputProvider inputs = new InputProvider(words);
+        var overallContext = this.CreateContext();
+
+        while (this.CanProcessNextWord(overallContext, inputs))
+        {
+            tokens.Add(this.ProcessNextWord(overallContext, inputs));
+        }
+
+        return tokens;
+    }
+
+    public InputSchemaContext CreateContext()
+    {
+        return new InputSchemaContext();
+    }
+}
diff --git a/TextParser/Schema/InputSchemaBuilder.cs b/TextParser/Schema/InputSchemaBuilder.cs
new file mode 100644
index 0000000..4114ad2
--- /dev/null
+++ b/TextParser/Schema/InputSchemaBuilder.cs
@@ -0,0 +1,69 @@
+namespace Parsing.Schema;
+
+using Parsing.Schema.BuildingBlocks;
+
+/// <summary>Fluent builder for InputSchema; Repeat opens a nested builder that EndRepetition closes.</summary>
+public class InputSchemaBuilder
+{
+    private InputSchema schema = new InputSchema();
+
+    public InputSchemaBuilder()
+    {
+    }
+
+    public InputSchemaBuilder Expect(InputType type)
+    {
+        IBuildingBlock block;
+        switch (type)
+        {
+            case InputType.String:
+                block = new StringBlock();
+                break;
+            case InputType.Integer:
+                block = new IntegerBlock();
+                break;
+            default:
+                throw new Exception("Unrecognized InputType");
+        }
+        schema.AddBuildingBlock(block);
+        return this;
+    }
+
+    public InputSchemaBuilder Repeat(int repetitionCount)
+    {
+        // add another layer of parsing
+        var newInputSchemaBuilder = new RepetitionSchemaBuilder(this);
+        newInputSchemaBuilder.NumRepetition = repetitionCount;
+        newInputSchemaBuilder.RepetitionType = RepetitionType.FixedRepetition;
+
+        return newInputSchemaBuilder;
+    }
+
+    public InputSchemaBuilder EndRepetition()
+    {
+        // return back to upper layer of parsing
+        var currentBuilder = this as RepetitionSchemaBuilder;
+        if (currentBuilder == null)
+        {
+            throw new Exception("Invalid repetition definitions!");
+        }
+        var oldInputSchemaBuilder = currentBuilder.UpperLayerBuilder;
+
+        var currentSchema = currentBuilder.Build();
+        switch (currentBuilder.RepetitionType)
+        {
+            case RepetitionType.FixedRepetition:
+                oldInputSchemaBuilder.schema.AddBuildingBlock(new FixedRepetitionBlock(currentSchema, currentBuilder.NumRepetition));
+                break;
+            default:
+                throw new Exception("Unrecognized RepetitionType");
+        }
+
+        return oldInputSchemaBuilder;
+    }
+
+    public InputSchema Build()
+    {
+        return schema;
+    }
+}
diff --git a/TextParser/Schema/InputType.cs b/TextParser/Schema/InputType.cs
new file mode 100644
index 0000000..2977d73
--- /dev/null
+++ b/TextParser/Schema/InputType.cs
@@ -0,0 +1,8 @@
+namespace Parsing.Schema;
+
+/// <summary>Token types a schema can expect; the values mirror the matching BlockType members.</summary>
+public enum InputType
+{
+    Integer = BlockType.Integer,
+    String = BlockType.String
+}
diff --git a/TextParser/Schema/RepetitionSchemaBuilder.cs b/TextParser/Schema/RepetitionSchemaBuilder.cs
new file mode 100644
index 0000000..43f828f
--- /dev/null
+++ b/TextParser/Schema/RepetitionSchemaBuilder.cs
@@ -0,0 +1,16 @@
+namespace Parsing.Schema;
+
+/// <summary>Builder for the body of a repetition; keeps the enclosing builder so EndRepetition can return to it.</summary>
+public class RepetitionSchemaBuilder : InputSchemaBuilder
+{
+    public RepetitionSchemaBuilder(InputSchemaBuilder upperLayerBuilder)
+    {
+        this.UpperLayerBuilder = upperLayerBuilder;
+    }
+
+    public InputSchemaBuilder UpperLayerBuilder { get; set; }
+
+    public int NumRepetition { get; set; }
+
+    public RepetitionType RepetitionType { get; set; }
+}
diff --git a/TextParser/Schema/RepetitionType.cs b/TextParser/Schema/RepetitionType.cs
new file mode 100644
index 0000000..698dcaa
--- /dev/null
+++ b/TextParser/Schema/RepetitionType.cs
@@ -0,0 +1,8 @@
+namespace Parsing.Schema;
+
+/// <summary>Repetition kinds; the values mirror the matching BlockType members.</summary>
+public enum RepetitionType
+{
+    FixedRepetition = BlockType.FixedRepetition,
+    GreedyRepetition = BlockType.GreedyRepetition
+}
diff --git a/TextParser/TextParser.cs b/TextParser/TextParser.cs
new file mode 100644
index 0000000..e770503
--- /dev/null
+++ b/TextParser/TextParser.cs
@@ -0,0 +1,37 @@
+namespace Parsing;
+
+using Parsing.Schema;
+using Parsing.Tokenization;
+
+/// <summary>Splits a line on the configured delimiters and tokenizes the resulting words with an InputSchema.</summary>
+public class TextParser
+{
+    private string[] delimiters;
+    private bool removeEmptyEntries = false;
+    private InputSchema schema;
+    private InputSchemaContext context;
+
+    public TextParser(InputSchema schema, string[]? delimiters = null, bool removeEmptyEntries = true)
+    {
+        this.delimiters = delimiters ?? new string[] { " " };
+        this.removeEmptyEntries = removeEmptyEntries;
+        this.schema = schema;
+        this.context = this.schema.CreateContext();
+    }
+
+    private string[] ParseLineIntoWords(string line)
+    {
+        var options = StringSplitOptions.TrimEntries;
+        if (this.removeEmptyEntries)
+        {
+            options = options | StringSplitOptions.RemoveEmptyEntries;
+        }
+        return line.Split(this.delimiters, options);
+    }
+
+    public IList<IToken> ParseLine(string line)
+    {
+        var words = this.ParseLineIntoWords(line);
+        return this.schema.ProcessWordList(words);
+    }
+}
diff --git a/TextParser/Tokenization/IToken.cs b/TextParser/Tokenization/IToken.cs
new file mode 100644
index 0000000..f3acc51
--- /dev/null
+++ b/TextParser/Tokenization/IToken.cs
@@ -0,0 +1,11 @@
+namespace Parsing.Tokenization;
+
+using Parsing.Schema;
+
+/// <summary>A parsed word together with the input type it was parsed as.</summary>
+public interface IToken
+{
+    public string GetText();
+
+    public InputType GetInputType();
+}
diff --git a/TextParser/Tokenization/IValueToken.cs b/TextParser/Tokenization/IValueToken.cs
new file mode 100644
index 0000000..a217488
--- /dev/null
+++ b/TextParser/Tokenization/IValueToken.cs
@@ -0,0 +1,7 @@
+namespace Parsing.Tokenization;
+
+/// <summary>A token that exposes its word as a typed value.</summary>
+public interface IValueToken<T> : IToken
+{
+    public T GetValue();
+}
diff --git a/TextParser/Tokenization/InputProvider.cs b/TextParser/Tokenization/InputProvider.cs
new file mode 100644
index 0000000..bafa7a9
--- /dev/null
+++ b/TextParser/Tokenization/InputProvider.cs
@@ -0,0 +1,55 @@
+namespace Parsing.Tokenization;
+
+/// <summary>Cursor over the words of one input line, with rewindable lookahead.</summary>
+public class InputProvider
+{
+    /// <summary>Remembers the cursor position when created and restores it on Dispose.</summary>
+    public class LookaheadContext : IDisposable
+    {
+        private InputProvider contextedProvider;
+        private int initialPosition;
+
+        public LookaheadContext(InputProvider contextedProvider)
+        {
+            this.contextedProvider = contextedProvider;
+            this.initialPosition = contextedProvider.CurrentPosition;
+            contextedProvider.IsBeingLookedAhead = true;
+        }
+
+        public void Dispose()
+        {
+            this.contextedProvider.CurrentPosition = this.initialPosition;
+            contextedProvider.IsBeingLookedAhead = false;
+        }
+    }
+
+    private string[] words;
+    private bool IsBeingLookedAhead { get; set; } = false;
+
+    private int CurrentPosition { get; set; }
+
+    public InputProvider(string[] words)
+    {
+        this.words = words;
+        this.CurrentPosition = 0;
+    }
+
+    public InputProvider.LookaheadContext GetLookaheadContext()
+    {
+        return new InputProvider.LookaheadContext(this);
+    }
+
+    /// <summary>Returns the word at the cursor and advances it; returns string.Empty once the input is exhausted.</summary>
+    public string YieldWord()
+    {
+        // '>=' rather than '>': CurrentPosition == words.Length is already one past the last valid index
+        if (this.CurrentPosition >= this.words.Length)
+        {
+            return string.Empty;
+        }
+
+        var wordToProcess = this.words[this.CurrentPosition];
+        this.CurrentPosition++;
+        return wordToProcess;
+    }
+}
diff --git a/TextParser/Tokenization/IntegerToken.cs b/TextParser/Tokenization/IntegerToken.cs
new file mode 100644
index 0000000..a1311b1
--- /dev/null
+++ b/TextParser/Tokenization/IntegerToken.cs
@@ -0,0 +1,29 @@
+namespace Parsing.Tokenization;
+
+using Parsing.Schema;
+
+/// <summary>Token for an integer word; GetValue parses the stored text on every call.</summary>
+public class IntegerToken : IValueToken<int>
+{
+    private string word;
+
+    public IntegerToken(string word)
+    {
+        this.word = word;
+    }
+
+    public string GetText()
+    {
+        return word;
+    }
+
+    public int GetValue()
+    {
+        return int.Parse(word);
+    }
+
+    public InputType GetInputType()
+    {
+        return InputType.Integer;
+    }
+}
diff --git a/TextParser/Tokenization/StringToken.cs b/TextParser/Tokenization/StringToken.cs
new file mode 100644
index 0000000..1752ce9
--- /dev/null
+++ b/TextParser/Tokenization/StringToken.cs
@@ -0,0 +1,29 @@
+namespace Parsing.Tokenization;
+
+using Parsing.Schema;
+
+/// <summary>Token for a string word; its value is the text itself.</summary>
+public class StringToken : IValueToken<string>
+{
+    private string word;
+
+    public StringToken(string word)
+    {
+        this.word = word;
+    }
+
+    public string GetText()
+    {
+        return word;
+    }
+
+    public string GetValue()
+    {
+        return word;
+    }
+
+    public InputType GetInputType()
+    {
+        return InputType.String;
+    }
+}