feat: implement greedy repetition, ref: A24-13
Some checks failed
CI / linter (9.0.X, ubuntu-latest) (push) Failing after 2m6s
SonarQube Scan / SonarQube Trigger (push) Successful in 2m7s
CI / tests_linux (9.0.X, ubuntu-latest) (push) Has been skipped

This commit is contained in:
Simon Diesenreiter 2024-12-02 15:30:07 +01:00
parent e15190ecd6
commit 0d85132a32
11 changed files with 196 additions and 8 deletions

View File

@ -12,6 +12,9 @@ public class TextParserTests
private const string testInput3 = @"2 4 6 1
3 5 7 2
4 6 8 3";
// Ragged fixture: three lines holding 6, 3 and 5 words respectively, each line mixing integer and non-integer words.
private const string testInput4 = @"2 ab ba fd er sd
8 cd dc
7 uh 6 yp rt";
[Fact]
public void LineParser_TestSimpleRepetition()
@ -157,4 +160,43 @@ public class TextParserTests
Assert.Equal(2, columns[3][1]);
Assert.Equal(3, columns[3][2]);
}
[Fact]
public void TextParser_TestGreedyRepetitionAsRows()
{
    // Arrange: a greedy repetition of "one integer followed by a greedy run of strings".
    var schema = new InputSchemaBuilder()
        .Repeat()
        .Expect(InputType.Integer)
        .Repeat()
        .Expect(InputType.String)
        .EndRepetition()
        .EndRepetition()
        .Build();

    // Token types expected for each row of testInput4, in input order.
    var expectedTypes = new[]
    {
        new[] { InputType.Integer, InputType.String, InputType.String, InputType.String, InputType.String, InputType.String },
        new[] { InputType.Integer, InputType.String, InputType.String },
        new[] { InputType.Integer, InputType.String, InputType.Integer, InputType.String, InputType.String },
    };

    // Act
    var rows = new TextParser(schema)
        .SetInputText(testInput4)
        .Parse()
        .AsRawData();

    // Assert: the row count first, then every row length, then every token type.
    Assert.Equal(expectedTypes.Length, rows.Count);
    for (int rowIndex = 0; rowIndex < expectedTypes.Length; rowIndex++)
    {
        Assert.Equal(expectedTypes[rowIndex].Length, rows[rowIndex].Count);
    }
    for (int rowIndex = 0; rowIndex < expectedTypes.Length; rowIndex++)
    {
        for (int tokenIndex = 0; tokenIndex < expectedTypes[rowIndex].Length; tokenIndex++)
        {
            Assert.Equal(expectedTypes[rowIndex][tokenIndex], rows[rowIndex][tokenIndex].GetInputType());
        }
    }
}
}

View File

@ -12,6 +12,8 @@ abstract class BuildingBlockBase : IBuildingBlock
public abstract bool CanParseWord(InputProvider inputs);
public abstract bool CanParseWord(string word);
public abstract BlockType GetBlockType();
public virtual bool IsRepetitionType()
@ -19,7 +21,7 @@ abstract class BuildingBlockBase : IBuildingBlock
return false;
}
public virtual bool CheckIsDoneParsingAndReset()
public virtual bool CheckIsDoneParsingAndReset(InputProvider inputs)
{
// most blocks are always done parsing after consuming a token
// repetition blocks can consume multiple tokens

View File

@ -48,6 +48,21 @@ class FixedRepetitionBlock : BuildingBlockBase
return result;
}
/// <summary>
/// Reports whether this block can parse the given word.
/// </summary>
/// <param name="word">The word to test.</param>
/// <returns>
/// <c>false</c> when the repetition count is zero; otherwise whatever the nested schema
/// answers for <paramref name="word"/> in the current context.
/// </returns>
public override bool CanParseWord(string word)
{
    // Short-circuit: the nested schema is only consulted while the repetition count is non-zero.
    return this.repetitionCount != 0 && inputSchema.CanProcessNextWord(context, word);
}
public override BlockType GetBlockType()
{
return BlockType.FixedRepetition;
@ -58,7 +73,7 @@ class FixedRepetitionBlock : BuildingBlockBase
return true;
}
public override bool CheckIsDoneParsingAndReset()
public override bool CheckIsDoneParsingAndReset(InputProvider inputs)
{
// we are done parsing once all repetitions are exhausted
var result = this.repetitionCount == 0;

View File

@ -0,0 +1,57 @@
namespace Parsing.Schema.BuildingBlocks;
using System.IO.Pipelines;
using Parsing.Tokenization;
/// <summary>
/// Repetition block that wraps a nested <see cref="InputSchema"/> and keeps applying it
/// for as long as the upcoming input can still be parsed by that schema.
/// </summary>
class GreedyRepetitionBlock : BuildingBlockBase
{
    // The nested schema is assigned once in the constructor and never replaced.
    private readonly InputSchema inputSchema;

    // Progress through the nested schema; swapped for a fresh context on every reset.
    private InputSchemaContext context;

    /// <summary>
    /// Creates a greedy repetition over <paramref name="inputSchema"/>, starting with a fresh context.
    /// </summary>
    /// <param name="inputSchema">The schema to apply repeatedly.</param>
    public GreedyRepetitionBlock(InputSchema inputSchema)
    {
        this.inputSchema = inputSchema;
        this.context = this.inputSchema.CreateContext();
    }

    /// <summary>
    /// Lets the nested schema process the next word and returns the token it produced.
    /// </summary>
    /// <param name="inputs">The input to read from.</param>
    /// <returns>The token produced by the nested schema.</returns>
    public override IToken ParseWord(InputProvider inputs)
    {
        var token = this.inputSchema.ProcessNextWord(this.context, inputs);

        // If the nested schema cannot continue from its current position, start over with a
        // fresh context so that every later check is made against the start of the nested schema.
        if (!this.CanParseWord(inputs))
        {
            this.ResetContext();
        }

        return token;
    }

    /// <summary>
    /// Reports whether the nested schema accepts the next input word in the current context
    /// and the input still has a word to yield.
    /// </summary>
    /// <param name="inputs">The input to inspect.</param>
    public override bool CanParseWord(InputProvider inputs)
    {
        return this.inputSchema.CanProcessNextWord(this.context, inputs) && inputs.CanYieldWord();
    }

    /// <summary>
    /// Reports whether the nested schema accepts <paramref name="word"/> in the current context.
    /// </summary>
    /// <param name="word">The word to test.</param>
    public override bool CanParseWord(string word)
    {
        return this.inputSchema.CanProcessNextWord(this.context, word);
    }

    /// <summary>
    /// Identifies this block as <see cref="BlockType.GreedyRepetition"/>.
    /// </summary>
    public override BlockType GetBlockType()
    {
        return BlockType.GreedyRepetition;
    }

    /// <summary>
    /// Always <c>true</c>: this block is a repetition type.
    /// </summary>
    public override bool IsRepetitionType()
    {
        return true;
    }

    /// <summary>
    /// Reports whether the greedy repetition is finished, resetting the context when it is.
    /// </summary>
    /// <param name="inputs">The input whose next word decides whether parsing continues.</param>
    /// <returns><c>true</c> when the next word cannot be parsed by this block.</returns>
    public override bool CheckIsDoneParsingAndReset(InputProvider inputs)
    {
        // we are done parsing greedily once the next token doesn't match anymore
        var isDone = !this.CanParseWord(inputs);
        if (isDone)
        {
            this.ResetContext();
        }

        return isDone;
    }

    // Replaces the current context with a fresh one obtained from the nested schema.
    private void ResetContext()
    {
        this.context = this.inputSchema.CreateContext();
    }
}

View File

@ -8,9 +8,11 @@ public interface IBuildingBlock
public bool CanParseWord(InputProvider inputs);
public bool CanParseWord(string word);
public BlockType GetBlockType();
public bool IsRepetitionType();
public bool CheckIsDoneParsingAndReset();
public bool CheckIsDoneParsingAndReset(InputProvider inputs);
}

View File

@ -18,12 +18,16 @@ class IntegerBlock : BuildingBlockBase
{
using (inputs.GetLookaheadContext())
{
int number = 0;
var success = int.TryParse(inputs.YieldWord(), out number);
return success;
return this.CanParseWord(inputs.YieldWord());
}
}
/// <summary>
/// Reports whether <paramref name="word"/> can be parsed as an <see cref="int"/>.
/// </summary>
/// <param name="word">The word to test.</param>
/// <returns><c>true</c> when <see cref="int.TryParse(string, out int)"/> succeeds.</returns>
public override bool CanParseWord(string word)
{
    // Only the success flag matters here, so the parsed value is discarded.
    return int.TryParse(word, out _);
}
public override BlockType GetBlockType()
{
return BlockType.Integer;

View File

@ -16,6 +16,24 @@ class StringBlock : BuildingBlockBase
/// <summary>
/// Reports whether the next word of <paramref name="inputs"/> can be parsed by this block.
/// </summary>
/// <param name="inputs">The input whose next word is inspected.</param>
public override bool CanParseWord(InputProvider inputs)
{
    // Read the next word inside a lookahead context and hand it to the string-based check.
    using (inputs.GetLookaheadContext())
    {
        return this.CanParseWord(inputs.YieldWord());
    }
}
/// <summary>
/// Reports whether <paramref name="word"/> counts as a string token.
/// </summary>
/// <param name="word">The word to test.</param>
/// <returns><c>false</c> when an <see cref="IntegerBlock"/> can parse the word; otherwise <c>true</c>.</returns>
public override bool CanParseWord(string word)
{
    // A string can match pretty much anything, so a word is only treated as a string
    // when the integer block does not accept it.
    var integerProbe = new IntegerBlock();
    return !integerProbe.CanParseWord(word);
}

View File

@ -29,7 +29,7 @@ public class InputSchema
{
var nextBlock = this.buildingBlocks[currentContext.lastProcessedBlockIndex];
var token = nextBlock.ParseWord(inputs);
if (!nextBlock.IsRepetitionType() || nextBlock.CheckIsDoneParsingAndReset())
if (!nextBlock.IsRepetitionType() || nextBlock.CheckIsDoneParsingAndReset(inputs))
{
currentContext.lastProcessedBlockIndex++;
currentContext.HasFinished = currentContext.lastProcessedBlockIndex >= this.buildingBlocks.Count;
@ -50,6 +50,16 @@ public class InputSchema
}
}
/// <summary>
/// Reports whether the block at the context's current index can parse <paramref name="word"/>.
/// </summary>
/// <param name="currentContext">The context tracking progress through this schema.</param>
/// <param name="word">The word to test.</param>
/// <returns><c>false</c> when the context has finished; otherwise the current block's answer.</returns>
public bool CanProcessNextWord(InputSchemaContext currentContext, string word)
{
    // Short-circuit keeps the block lookup from running once the context has finished.
    return !currentContext.HasFinished
        && this.buildingBlocks[currentContext.lastProcessedBlockIndex].CanParseWord(word);
}
public List<IToken> ProcessWordList(string[] words)
{
List<IToken> tokens = new List<IToken>();

View File

@ -38,6 +38,15 @@ public class InputSchemaBuilder
return newInputSchemaBuilder;
}
/// <summary>
/// Opens a greedy repetition: returns a nested builder, created on top of this one,
/// whose repetition type is <see cref="RepetitionType.GreedyRepetition"/>.
/// </summary>
/// <returns>The nested builder for the repetition.</returns>
public InputSchemaBuilder Repeat()
{
    // add another layer of parsing
    return new RepetitionSchemaBuilder(this)
    {
        RepetitionType = RepetitionType.GreedyRepetition,
    };
}
public InputSchemaBuilder EndRepetition()
{
// return back to upper layer of parsing
@ -54,6 +63,9 @@ public class InputSchemaBuilder
case RepetitionType.FixedRepetition:
oldInputSchemaBuilder.schema.AddBuildingBlock(new FixedRepetitionBlock(currentSchema, currentBuilder.NumRepetition));
break;
case RepetitionType.GreedyRepetition:
oldInputSchemaBuilder.schema.AddBuildingBlock(new GreedyRepetitionBlock(currentSchema));
break;
default:
throw new Exception("Unrecognized RepetitionType");
}

View File

@ -38,6 +38,20 @@ public class TokenConverter
return returnData;
}
/// <summary>
/// Verifies that the token rows can be converted to columns or a grid,
/// which requires every row to have the same length.
/// </summary>
/// <exception cref="Exception">Thrown when a row's length differs from the first row's length.</exception>
private void CheckConversionPrerequisites()
{
    // With no rows there is no first row to compare against and nothing that could be ragged;
    // indexing rawTokens[0] in that case would fail with an out-of-range error instead.
    if (this.rawTokens.Count == 0)
    {
        return;
    }

    // in order to convert rows to columns or grid we require every row to have the same length
    int rowLength = this.rawTokens[0].Count;
    foreach (var tokenRow in this.rawTokens)
    {
        if (tokenRow.Count != rowLength)
        {
            throw new Exception("Attempted to convert token dataset that is not able to be converted!");
        }
    }
}
public List<T[]> AsRows<T>()
{
var listRows = this.AsListRows<T>();
@ -71,6 +85,7 @@ public class TokenConverter
public List<List<T>> AsListColumns<T>()
{
this.CheckConversionPrerequisites();
var rows = AsListRows<T>();
var columns = new List<List<T>>();
@ -92,7 +107,13 @@ public class TokenConverter
/// <summary>
/// Returns the parsed data as a jagged array built from <see cref="AsRows{T}"/>.
/// </summary>
/// <typeparam name="T">The element type of the rows.</typeparam>
/// <returns>The rows as an array of arrays.</returns>
public T[][] AsGrid<T>()
{
    // A grid needs rows of equal length; the check throws when that does not hold.
    this.CheckConversionPrerequisites();
    return AsRows<T>().ToArray();
}
/// <summary>
/// Returns the stored token rows as they are, without conversion or copying.
/// </summary>
public List<List<IToken>> AsRawData() => this.rawTokens;
}

View File

@ -37,9 +37,14 @@ public class InputProvider
return new InputProvider.LookaheadContext(this);
}
/// <summary>
/// Reports whether the current position is still before the end of the word array.
/// </summary>
public bool CanYieldWord() => this.CurrentPosition < this.words.Length;
public string YieldWord()
{
if (this.CurrentPosition > this.words.Length)
if (!this.CanYieldWord())
{
return string.Empty;
}