Looking for an algorithm to identify contiguous repeated series of lines in a long string
I would like an algorithm that can identify repeated parts of big stack traces like this:
java.lang.StackOverflowError
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
With a bit of inspection, it's clear that this segment is being repeated three times:
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
The end goal is so I can print out stack traces of recursive functions in a nicer fashion
java.lang.StackOverflowError
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
------------------------- Repeated 3x -------------------------
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
-------------------------------------------------------------
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
I'm not sure how feasible this is, but I'd be happy for any and all possible solutions even if they have their own limitations and constraints. Preliminary Googling didn't find me anything, but quite likely I just don't know what to google
algorithms
New contributor
add a comment |
I would like an algorithm that can identify repeated parts of big stack traces like this:
java.lang.StackOverflowError
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
With a bit of inspection, it's clear that this segment is being repeated three times:
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
The end goal is so I can print out stack traces of recursive functions in a nicer fashion
java.lang.StackOverflowError
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
------------------------- Repeated 3x -------------------------
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
-------------------------------------------------------------
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
I'm not sure how feasible this is, but I'd be happy for any and all possible solutions even if they have their own limitations and constraints. Preliminary Googling didn't find me anything, but quite likely I just don't know what to google
algorithms
New contributor
add a comment |
I would like an algorithm that can identify repeated parts of big stack traces like this:
java.lang.StackOverflowError
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
With a bit of inspection, it's clear that this segment is being repeated three times:
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
The end goal is so I can print out stack traces of recursive functions in a nicer fashion
java.lang.StackOverflowError
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
------------------------- Repeated 3x -------------------------
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
-------------------------------------------------------------
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
I'm not sure how feasible this is, but I'd be happy for any and all possible solutions even if they have their own limitations and constraints. Preliminary Googling didn't find me anything, but quite likely I just don't know what to google
algorithms
New contributor
I would like an algorithm that can identify repeated parts of big stack traces like this:
java.lang.StackOverflowError
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
With a bit of inspection, it's clear that this segment is being repeated three times:
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
The end goal is so I can print out stack traces of recursive functions in a nicer fashion
java.lang.StackOverflowError
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
------------------------- Repeated 3x -------------------------
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
at typechecker.Typers$Typer.silent(Typers.scala:700)
at typechecker.Typers$Typer.normalTypedApply$1(Typers.scala:4754)
at typechecker.Typers$Typer.typedApply$1(Typers.scala:4801)
at typechecker.Typers$Typer.typedInAnyMode$1(Typers.scala:5586)
at typechecker.Typers$Typer.typed1(Typers.scala:5603)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5723)
at typechecker.Typers$Typer.typedQualifier(Typers.scala:5731)
at transform.Erasure$Eraser.adaptMember(Erasure.scala:714)
at transform.Erasure$Eraser.typed1(Erasure.scala:789)
-------------------------------------------------------------
at typechecker.Typers$Typer.runTyper$1(Typers.scala:5640)
at typechecker.Typers$Typer.typedInternal(Typers.scala:5672)
at typechecker.Typers$Typer.body$2(Typers.scala:5613)
at typechecker.Typers$Typer.typed(Typers.scala:5618)
at typechecker.Typers$Typer.$anonfun$typed1$38(Typers.scala:4752)
I'm not sure how feasible this is, but I'd be happy for any and all possible solutions even if they have their own limitations and constraints. Preliminary Googling didn't find me anything, but quite likely I just don't know what to google
algorithms
algorithms
New contributor
New contributor
edited yesterday
Li Haoyi
New contributor
asked yesterday
Li HaoyiLi Haoyi
1314
1314
New contributor
New contributor
add a comment |
add a comment |
5 Answers
5
active
oldest
votes
Build suffix tree using Ukkonen's algorithm, this way in $mathcal O(n)$ you will find all substrings in provided text with indices.
In the case of approximate matching, there is also extended version of Ukkonen's algorithm.
add a comment |
If you consider that “a line of stacktrace” = “a character”, you can use:
http://en.wikipedia.org/wiki/Longest_repeated_substring_problem
One way to solve this problem efficiently is by constructing a Suffix Trie: http://en.wikipedia.org/wiki/Suffix_tree
Every time you traverse an already estabilished path you are actually discovering a repetition in your string.
Now, simply list all the repetions in a separate data structure and extract the longest one... or all if you want.
New contributor
1
The longest repeated substring problem appears to also find substrings which overlap, or are repeated with gaps between them. Is there a way to make it only find substrings which are repeated consecutively without gaps or overlap?
– Li Haoyi
yesterday
add a comment |
An efficient string factorization algorithm may help.
Given a string $S$, $n = |S|$ find maximum $p$ such that $S = T^p$ e.g. $T$ concatenated $p$ times, we call $T$ the seed and p the period. This can be done in $O(n)$ using the prefix function of the (Knuth-)Morris-Pratt algorithm, but don't be scared by the name it's very simple and beautiful algorithm, here's my solution for the corresponding problem SPOJ PERIOD.
Armed with a fast string factorization we can then proceed to decompose a string as the concatenation of factored chunks (dynamic programming) using a custom cost function e.g. minimize the total length of the seeds, minimize the sum of squares of the seeds' lengths...
$S = {T_1}^{p_1}{T_2}^{p_2}cdots{T_k}^{p_k}$
Total complexity is $O(n^2)$.
If $O(n^2)$ is way too costly you can shift to a greedy like strategy e.g. as soon as you find a factorizable chunk, squeeze it and continue from that point, and also limit the maximum seed size (e.g. $|T|le 200$) and prune the search if you don't find a period $ge 2$ in the first e.g. 400 characters.
New contributor
add a comment |
Under normal circumstances, JVM fills only the last 1024 calls in a stacktrace, and in Dotty/Scalac most stackoverflows have a repeating fragment of length ≈ 70 or less. A stacktrace T of a StackOverflowException can be decomposed into three parts S · R^N · P, where R is the repeating part of the stacktrace, S is some suffix of R, and P is either some prefix of R or an unrelated sequence of calls. We are interested in a solution such that the total length C = |S · R^N| of the repeating part and N are both maximal, and |S| is minimal.
// Scala Pseudocode (beware of for comprehensions)
//
// Stack is assumed to be in reverse order,
// most recent stack frame is last.
val stack: Array[StackTraceElement]
val F: Int // Maximum size of R.
val candidates = for {
// Enumerate all possible suffixes S.
S <- ∀ prefix of stack
if |S| < F
// Remove the suffix from the stack,
R <- ∀ non-empty prefix of stack.drop(|S|)
// Find a fragment that ends with S.
if R.endsWith(S)
// Find out how many fragments fit into the stack.
// (how many times we can remove R from the stack)
N = coverSize(R, stack.drop(|S|))
if N >= 2 // Or higher.
} yield (S, R, N)
// Best cover has maximum coverage,
// minimum fragment length,
// and minimum suffix length.
val bestCandidate = candidates.maxBy { (S, R, N) =>
val C = |S| + |R| * N
return (C, -|R|, -|S|)
}
The entire algorithm can be implemented in a way that does not allocate any memory (to handle OOM). It has complexity O(F^2 |T|), but exceptions are rare enough and this is a small constant (F << 1024, T = 1024).
I have implemented this exact algorithm in my library https://github.com/alexknvl/tracehash (https://github.com/alexknvl/tracehash/blob/master/src/main/java/tracehash/internal/SOCoverSolver.java) for the same purpose of simplifying scalac/dotc errors ;)
EDIT: Here is an implementation of the same algorithm in Python:
stack = list(reversed([3, 4, 2, 1, 2, 1, 2, 1, 2, 1, 2]))
F = 6
results =
for slen in range(0, F + 1):
suffix, stack1 = stack[:slen], stack[slen:]
for flen in range(1, F + 1):
fragment = stack1[:flen]
if fragment[flen - slen:] != suffix:
continue
stack2 = stack1[:]
n = 0
while stack2[:flen] == fragment:
stack2 = stack2[flen:]
n += 1
if n >= 2: # A heuristic, might want to set it a bit higher.
results.append((slen, flen, n))
def cost(t):
s, r, n = t
c = s + r * n
return (c, -r, -s)
S, R, N = max(results, key=cost)
print('%s · %s^%d · %s' % (stack[:S], stack[S:S+R], N, stack[S+R*N:]))
# Prints · [2, 1]^4 · [2, 4, 3]
EDIT2: Following some of the ideas from mukel's answer, here is a function https://gist.github.com/alexknvl/b041099eb5347d728e2dacd1e8caed8c that solves something along the lines of:
stack = a[1]^k[1] · a[2]^k[2] · ...
argmax (sum |a[i]| * k[i] where k[i] >= 2,
-sum |a[i]| where k[i] >= 2,
-sum |a[i]| where k[i] == 1)
It is greedy so it is not necessarily an optimal solution, but it seems to work reasonably well in simple cases, e.g. given
stack = list(reversed([
3, 4, 2,
1, 2, 1, 2, 1, 2, 1, 2,
5,
4, 5, 4, 5, 4, 5,
3, 3, 3, 3]))
it produces an answer
[([3], 4), ([5, 4], 3), ([5], 1), ([2, 1], 4), ([2, 4, 3], 1)]
New contributor
"The entire algorithm can be implemented in a way that does not allocate any memory." Do you mean that the algorithm can be implemented so that it only uses stack memory? Why is it useful to use stack memory only?
– Apass.Jack
12 hours ago
1
You might want to avoid allocating anything on the heap in case you catch an OutOfMemoryError.
– alex.knvl
12 hours ago
I see. Well, it happens the error in the question is StackOverflowError.
– Apass.Jack
32 mins ago
add a comment |
string contiguous_repeated_string(string A){
string r(1,A[0]);
for(int i=0;i<A.size();i++){
for(int l=2;l<=A.size();l++){
string s=A.substr(i,l);
if ((s+s).substr(1,2*s.size()-1).find(s)!=s.size()-1 and s.size()>r.size()){
r=s;
}
}
}
return r;
}
Call contiguous_repeated_string("abcdefgabcdabcdabcd")
you will get abcdabcdabcd
.
Call contiguous_repeated_string("ATCGATCGA")
you will get ATCGATCG
.
And then you can use KMP's Longest Proper Prefix Postfix array
to fold the resulting string with O(N)
.
Total time complexity is O(N^2)
.
New contributor
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["$", "$"], ["\\(","\\)"]]);
});
});
}, "mathjax-editing");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "419"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Li Haoyi is a new contributor. Be nice, and check out our Code of Conduct.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcs.stackexchange.com%2fquestions%2f102463%2flooking-for-an-algorithm-to-identify-contiguous-repeated-series-of-lines-in-a-lo%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
5 Answers
5
active
oldest
votes
5 Answers
5
active
oldest
votes
active
oldest
votes
active
oldest
votes
Build suffix tree using Ukkonen's algorithm, this way in $mathcal O(n)$ you will find all substrings in provided text with indices.
In the case of approximate matching, there is also extended version of Ukkonen's algorithm.
add a comment |
Build suffix tree using Ukkonen's algorithm, this way in $mathcal O(n)$ you will find all substrings in provided text with indices.
In the case of approximate matching, there is also extended version of Ukkonen's algorithm.
add a comment |
Build suffix tree using Ukkonen's algorithm, this way in $mathcal O(n)$ you will find all substrings in provided text with indices.
In the case of approximate matching, there is also extended version of Ukkonen's algorithm.
Build suffix tree using Ukkonen's algorithm, this way in $mathcal O(n)$ you will find all substrings in provided text with indices.
In the case of approximate matching, there is also extended version of Ukkonen's algorithm.
answered yesterday
EvilEvil
7,61842245
7,61842245
add a comment |
add a comment |
If you consider that “a line of stacktrace” = “a character”, you can use:
http://en.wikipedia.org/wiki/Longest_repeated_substring_problem
One way to solve this problem efficiently is by constructing a Suffix Trie: http://en.wikipedia.org/wiki/Suffix_tree
Every time you traverse an already estabilished path you are actually discovering a repetition in your string.
Now, simply list all the repetions in a separate data structure and extract the longest one... or all if you want.
New contributor
1
The longest repeated substring problem appears to also find substrings which overlap, or are repeated with gaps between them. Is there a way to make it only find substrings which are repeated consecutively without gaps or overlap?
– Li Haoyi
yesterday
add a comment |
If you consider that “a line of stacktrace” = “a character”, you can use:
http://en.wikipedia.org/wiki/Longest_repeated_substring_problem
One way to solve this problem efficiently is by constructing a Suffix Trie: http://en.wikipedia.org/wiki/Suffix_tree
Every time you traverse an already estabilished path you are actually discovering a repetition in your string.
Now, simply list all the repetions in a separate data structure and extract the longest one... or all if you want.
New contributor
1
The longest repeated substring problem appears to also find substrings which overlap, or are repeated with gaps between them. Is there a way to make it only find substrings which are repeated consecutively without gaps or overlap?
– Li Haoyi
yesterday
add a comment |
If you consider that “a line of stacktrace” = “a character”, you can use:
http://en.wikipedia.org/wiki/Longest_repeated_substring_problem
One way to solve this problem efficiently is by constructing a Suffix Trie: http://en.wikipedia.org/wiki/Suffix_tree
Every time you traverse an already estabilished path you are actually discovering a repetition in your string.
Now, simply list all the repetions in a separate data structure and extract the longest one... or all if you want.
New contributor
If you consider that “a line of stacktrace” = “a character”, you can use:
http://en.wikipedia.org/wiki/Longest_repeated_substring_problem
One way to solve this problem efficiently is by constructing a Suffix Trie: http://en.wikipedia.org/wiki/Suffix_tree
Every time you traverse an already estabilished path you are actually discovering a repetition in your string.
Now, simply list all the repetions in a separate data structure and extract the longest one... or all if you want.
New contributor
New contributor
answered yesterday
n1r3n1r3
1211
1211
New contributor
New contributor
1
The longest repeated substring problem appears to also find substrings which overlap, or are repeated with gaps between them. Is there a way to make it only find substrings which are repeated consecutively without gaps or overlap?
– Li Haoyi
yesterday
add a comment |
1
The longest repeated substring problem appears to also find substrings which overlap, or are repeated with gaps between them. Is there a way to make it only find substrings which are repeated consecutively without gaps or overlap?
– Li Haoyi
yesterday
1
1
The longest repeated substring problem appears to also find substrings which overlap, or are repeated with gaps between them. Is there a way to make it only find substrings which are repeated consecutively without gaps or overlap?
– Li Haoyi
yesterday
The longest repeated substring problem appears to also find substrings which overlap, or are repeated with gaps between them. Is there a way to make it only find substrings which are repeated consecutively without gaps or overlap?
– Li Haoyi
yesterday
add a comment |
An efficient string factorization algorithm may help.
Given a string $S$, $n = |S|$ find maximum $p$ such that $S = T^p$ e.g. $T$ concatenated $p$ times, we call $T$ the seed and p the period. This can be done in $O(n)$ using the prefix function of the (Knuth-)Morris-Pratt algorithm, but don't be scared by the name it's very simple and beautiful algorithm, here's my solution for the corresponding problem SPOJ PERIOD.
Armed with a fast string factorization we can then proceed to decompose a string as the concatenation of factored chunks (dynamic programming) using a custom cost function e.g. minimize the total length of the seeds, minimize the sum of squares of the seeds' lengths...
$S = {T_1}^{p_1}{T_2}^{p_2}cdots{T_k}^{p_k}$
Total complexity is $O(n^2)$.
If $O(n^2)$ is way too costly you can shift to a greedy like strategy e.g. as soon as you find a factorizable chunk, squeeze it and continue from that point, and also limit the maximum seed size (e.g. $|T|le 200$) and prune the search if you don't find a period $ge 2$ in the first e.g. 400 characters.
New contributor
add a comment |
An efficient string factorization algorithm may help.
Given a string $S$, $n = |S|$ find maximum $p$ such that $S = T^p$ e.g. $T$ concatenated $p$ times, we call $T$ the seed and p the period. This can be done in $O(n)$ using the prefix function of the (Knuth-)Morris-Pratt algorithm, but don't be scared by the name it's very simple and beautiful algorithm, here's my solution for the corresponding problem SPOJ PERIOD.
Armed with a fast string factorization we can then proceed to decompose a string as the concatenation of factored chunks (dynamic programming) using a custom cost function e.g. minimize the total length of the seeds, minimize the sum of squares of the seeds' lengths...
$S = {T_1}^{p_1}{T_2}^{p_2}cdots{T_k}^{p_k}$
Total complexity is $O(n^2)$.
If $O(n^2)$ is way too costly you can shift to a greedy like strategy e.g. as soon as you find a factorizable chunk, squeeze it and continue from that point, and also limit the maximum seed size (e.g. $|T|le 200$) and prune the search if you don't find a period $ge 2$ in the first e.g. 400 characters.
New contributor
add a comment |
An efficient string factorization algorithm may help.
Given a string $S$, $n = |S|$ find maximum $p$ such that $S = T^p$ e.g. $T$ concatenated $p$ times, we call $T$ the seed and p the period. This can be done in $O(n)$ using the prefix function of the (Knuth-)Morris-Pratt algorithm, but don't be scared by the name it's very simple and beautiful algorithm, here's my solution for the corresponding problem SPOJ PERIOD.
Armed with a fast string factorization we can then proceed to decompose a string as the concatenation of factored chunks (dynamic programming) using a custom cost function e.g. minimize the total length of the seeds, minimize the sum of squares of the seeds' lengths...
$S = {T_1}^{p_1}{T_2}^{p_2}cdots{T_k}^{p_k}$
Total complexity is $O(n^2)$.
If $O(n^2)$ is way too costly you can shift to a greedy like strategy e.g. as soon as you find a factorizable chunk, squeeze it and continue from that point, and also limit the maximum seed size (e.g. $|T|le 200$) and prune the search if you don't find a period $ge 2$ in the first e.g. 400 characters.
New contributor
An efficient string factorization algorithm may help.
Given a string $S$, $n = |S|$ find maximum $p$ such that $S = T^p$ e.g. $T$ concatenated $p$ times, we call $T$ the seed and p the period. This can be done in $O(n)$ using the prefix function of the (Knuth-)Morris-Pratt algorithm, but don't be scared by the name it's very simple and beautiful algorithm, here's my solution for the corresponding problem SPOJ PERIOD.
Armed with a fast string factorization we can then proceed to decompose a string as the concatenation of factored chunks (dynamic programming) using a custom cost function e.g. minimize the total length of the seeds, minimize the sum of squares of the seeds' lengths...
$S = {T_1}^{p_1}{T_2}^{p_2}cdots{T_k}^{p_k}$
Total complexity is $O(n^2)$.
If $O(n^2)$ is way too costly you can shift to a greedy like strategy e.g. as soon as you find a factorizable chunk, squeeze it and continue from that point, and also limit the maximum seed size (e.g. $|T|le 200$) and prune the search if you don't find a period $ge 2$ in the first e.g. 400 characters.
New contributor
edited 23 hours ago
Apass.Jack
7,4571633
7,4571633
New contributor
answered yesterday
mukelmukel
212
212
New contributor
New contributor
add a comment |
add a comment |
Under normal circumstances, JVM fills only the last 1024 calls in a stacktrace, and in Dotty/Scalac most stackoverflows have a repeating fragment of length ≈ 70 or less. A stacktrace T of a StackOverflowException can be decomposed into three parts S · R^N · P, where R is the repeating part of the stacktrace, S is some suffix of R, and P is either some prefix of R or an unrelated sequence of calls. We are interested in a solution such that the total length C = |S · R^N| of the repeating part and N are both maximal, and |S| is minimal.
// Scala Pseudocode (beware of for comprehensions)
//
// Stack is assumed to be in reverse order,
// most recent stack frame is last.
val stack: Array[StackTraceElement]
val F: Int // Maximum size of R.
val candidates = for {
// Enumerate all possible suffixes S.
S <- ∀ prefix of stack
if |S| < F
// Remove the suffix from the stack,
R <- ∀ non-empty prefix of stack.drop(|S|)
// Find a fragment that ends with S.
if R.endsWith(S)
// Find out how many fragments fit into the stack.
// (how many times we can remove R from the stack)
N = coverSize(R, stack.drop(|S|))
if N >= 2 // Or higher.
} yield (S, R, N)
// Best cover has maximum coverage,
// minimum fragment length,
// and minimum suffix length.
val bestCandidate = candidates.maxBy { (S, R, N) =>
val C = |S| + |R| * N
return (C, -|R|, -|S|)
}
The entire algorithm can be implemented in a way that does not allocate any memory (to handle OOM). It has complexity O(F^2 |T|), but exceptions are rare enough and this is a small constant (F << 1024, T = 1024).
I have implemented this exact algorithm in my library https://github.com/alexknvl/tracehash (https://github.com/alexknvl/tracehash/blob/master/src/main/java/tracehash/internal/SOCoverSolver.java) for the same purpose of simplifying scalac/dotc errors ;)
EDIT: Here is an implementation of the same algorithm in Python:
stack = list(reversed([3, 4, 2, 1, 2, 1, 2, 1, 2, 1, 2]))
F = 6
results =
for slen in range(0, F + 1):
suffix, stack1 = stack[:slen], stack[slen:]
for flen in range(1, F + 1):
fragment = stack1[:flen]
if fragment[flen - slen:] != suffix:
continue
stack2 = stack1[:]
n = 0
while stack2[:flen] == fragment:
stack2 = stack2[flen:]
n += 1
if n >= 2: # A heuristic, might want to set it a bit higher.
results.append((slen, flen, n))
def cost(t):
s, r, n = t
c = s + r * n
return (c, -r, -s)
S, R, N = max(results, key=cost)
print('%s · %s^%d · %s' % (stack[:S], stack[S:S+R], N, stack[S+R*N:]))
# Prints · [2, 1]^4 · [2, 4, 3]
EDIT2: Following some of the ideas from mukel's answer, here is a function https://gist.github.com/alexknvl/b041099eb5347d728e2dacd1e8caed8c that solves something along the lines of:
stack = a[1]^k[1] · a[2]^k[2] · ...
argmax (sum |a[i]| * k[i] where k[i] >= 2,
-sum |a[i]| where k[i] >= 2,
-sum |a[i]| where k[i] == 1)
It is greedy so it is not necessarily an optimal solution, but it seems to work reasonably well in simple cases, e.g. given
stack = list(reversed([
3, 4, 2,
1, 2, 1, 2, 1, 2, 1, 2,
5,
4, 5, 4, 5, 4, 5,
3, 3, 3, 3]))
it produces an answer
[([3], 4), ([5, 4], 3), ([5], 1), ([2, 1], 4), ([2, 4, 3], 1)]
New contributor
"The entire algorithm can be implemented in a way that does not allocate any memory." Do you mean that the algorithm can be implemented so that it only uses stack memory? Why is it useful to use stack memory only?
– Apass.Jack
12 hours ago
1
You might want to avoid allocating anything on the heap in case you catch an OutOfMemoryError.
– alex.knvl
12 hours ago
I see. Well, it happens the error in the question is StackOverflowError.
– Apass.Jack
32 mins ago
add a comment |
Under normal circumstances, JVM fills only the last 1024 calls in a stacktrace, and in Dotty/Scalac most stackoverflows have a repeating fragment of length ≈ 70 or less. A stacktrace T of a StackOverflowException can be decomposed into three parts S · R^N · P, where R is the repeating part of the stacktrace, S is some suffix of R, and P is either some prefix of R or an unrelated sequence of calls. We are interested in a solution such that the total length C = |S · R^N| of the repeating part and N are both maximal, and |S| is minimal.
// Scala Pseudocode (beware of for comprehensions)
//
// Stack is assumed to be in reverse order,
// most recent stack frame is last.
val stack: Array[StackTraceElement]
val F: Int // Maximum size of R.
val candidates = for {
// Enumerate all possible suffixes S.
S <- ∀ prefix of stack
if |S| < F
// Remove the suffix from the stack,
R <- ∀ non-empty prefix of stack.drop(|S|)
// Find a fragment that ends with S.
if R.endsWith(S)
// Find out how many fragments fit into the stack.
// (how many times we can remove R from the stack)
N = coverSize(R, stack.drop(|S|))
if N >= 2 // Or higher.
} yield (S, R, N)
// Best cover has maximum coverage,
// minimum fragment length,
// and minimum suffix length.
val bestCandidate = candidates.maxBy { (S, R, N) =>
val C = |S| + |R| * N
return (C, -|R|, -|S|)
}
The entire algorithm can be implemented in a way that does not allocate any memory (to handle OOM). It has complexity O(F^2 |T|), but exceptions are rare enough and this is a small constant (F << 1024, T = 1024).
I have implemented this exact algorithm in my library https://github.com/alexknvl/tracehash (https://github.com/alexknvl/tracehash/blob/master/src/main/java/tracehash/internal/SOCoverSolver.java) for the same purpose of simplifying scalac/dotc errors ;)
EDIT: Here is an implementation of the same algorithm in Python:
stack = list(reversed([3, 4, 2, 1, 2, 1, 2, 1, 2, 1, 2]))
F = 6
results =
for slen in range(0, F + 1):
suffix, stack1 = stack[:slen], stack[slen:]
for flen in range(1, F + 1):
fragment = stack1[:flen]
if fragment[flen - slen:] != suffix:
continue
stack2 = stack1[:]
n = 0
while stack2[:flen] == fragment:
stack2 = stack2[flen:]
n += 1
if n >= 2: # A heuristic, might want to set it a bit higher.
results.append((slen, flen, n))
def cost(t):
s, r, n = t
c = s + r * n
return (c, -r, -s)
S, R, N = max(results, key=cost)
print('%s · %s^%d · %s' % (stack[:S], stack[S:S+R], N, stack[S+R*N:]))
# Prints · [2, 1]^4 · [2, 4, 3]
EDIT2: Following some of the ideas from mukel's answer, here is a function https://gist.github.com/alexknvl/b041099eb5347d728e2dacd1e8caed8c that solves something along the lines of:
stack = a[1]^k[1] · a[2]^k[2] · ...
argmax (sum |a[i]| * k[i] where k[i] >= 2,
-sum |a[i]| where k[i] >= 2,
-sum |a[i]| where k[i] == 1)
It is greedy so it is not necessarily an optimal solution, but it seems to work reasonably well in simple cases, e.g. given
stack = list(reversed([
3, 4, 2,
1, 2, 1, 2, 1, 2, 1, 2,
5,
4, 5, 4, 5, 4, 5,
3, 3, 3, 3]))
it produces an answer
[([3], 4), ([5, 4], 3), ([5], 1), ([2, 1], 4), ([2, 4, 3], 1)]
New contributor
"The entire algorithm can be implemented in a way that does not allocate any memory." Do you mean that the algorithm can be implemented so that it only uses stack memory? Why is it useful to use stack memory only?
– Apass.Jack
12 hours ago
1
You might want to avoid allocating anything on the heap in case you catch an OutOfMemoryError.
– alex.knvl
12 hours ago
I see. Well, it happens the error in the question is StackOverflowError.
– Apass.Jack
32 mins ago
add a comment |
Under normal circumstances, JVM fills only the last 1024 calls in a stacktrace, and in Dotty/Scalac most stackoverflows have a repeating fragment of length ≈ 70 or less. A stacktrace T of a StackOverflowException can be decomposed into three parts S · R^N · P, where R is the repeating part of the stacktrace, S is some suffix of R, and P is either some prefix of R or an unrelated sequence of calls. We are interested in a solution such that the total length C = |S · R^N| of the repeating part and N are both maximal, and |S| is minimal.
// Scala Pseudocode (beware of for comprehensions)
//
// Stack is assumed to be in reverse order,
// most recent stack frame is last.
val stack: Array[StackTraceElement]
val F: Int // Maximum size of R.
val candidates = for {
// Enumerate all possible suffixes S.
S <- ∀ prefix of stack
if |S| < F
// Remove the suffix from the stack,
R <- ∀ non-empty prefix of stack.drop(|S|)
// Find a fragment that ends with S.
if R.endsWith(S)
// Find out how many fragments fit into the stack.
// (how many times we can remove R from the stack)
N = coverSize(R, stack.drop(|S|))
if N >= 2 // Or higher.
} yield (S, R, N)
// Best cover has maximum coverage,
// minimum fragment length,
// and minimum suffix length.
val bestCandidate = candidates.maxBy { (S, R, N) =>
val C = |S| + |R| * N
return (C, -|R|, -|S|)
}
The entire algorithm can be implemented in a way that does not allocate any memory (to handle OOM). It has complexity O(F^2 |T|), but exceptions are rare enough and this is a small constant (F << 1024, T = 1024).
I have implemented this exact algorithm in my library https://github.com/alexknvl/tracehash (https://github.com/alexknvl/tracehash/blob/master/src/main/java/tracehash/internal/SOCoverSolver.java) for the same purpose of simplifying scalac/dotc errors ;)
EDIT: Here is an implementation of the same algorithm in Python:
stack = list(reversed([3, 4, 2, 1, 2, 1, 2, 1, 2, 1, 2]))
F = 6
results =
for slen in range(0, F + 1):
suffix, stack1 = stack[:slen], stack[slen:]
for flen in range(1, F + 1):
fragment = stack1[:flen]
if fragment[flen - slen:] != suffix:
continue
stack2 = stack1[:]
n = 0
while stack2[:flen] == fragment:
stack2 = stack2[flen:]
n += 1
if n >= 2: # A heuristic, might want to set it a bit higher.
results.append((slen, flen, n))
def cost(t):
s, r, n = t
c = s + r * n
return (c, -r, -s)
S, R, N = max(results, key=cost)
print('%s · %s^%d · %s' % (stack[:S], stack[S:S+R], N, stack[S+R*N:]))
# Prints · [2, 1]^4 · [2, 4, 3]
EDIT2: Following some of the ideas from mukel's answer, here is a function https://gist.github.com/alexknvl/b041099eb5347d728e2dacd1e8caed8c that solves something along the lines of:
stack = a[1]^k[1] · a[2]^k[2] · ...
argmax (sum |a[i]| * k[i] where k[i] >= 2,
-sum |a[i]| where k[i] >= 2,
-sum |a[i]| where k[i] == 1)
It is greedy so it is not necessarily an optimal solution, but it seems to work reasonably well in simple cases, e.g. given
stack = list(reversed([
3, 4, 2,
1, 2, 1, 2, 1, 2, 1, 2,
5,
4, 5, 4, 5, 4, 5,
3, 3, 3, 3]))
it produces an answer
[([3], 4), ([5, 4], 3), ([5], 1), ([2, 1], 4), ([2, 4, 3], 1)]
New contributor
Under normal circumstances, JVM fills only the last 1024 calls in a stacktrace, and in Dotty/Scalac most stackoverflows have a repeating fragment of length ≈ 70 or less. A stacktrace T of a StackOverflowException can be decomposed into three parts S · R^N · P, where R is the repeating part of the stacktrace, S is some suffix of R, and P is either some prefix of R or an unrelated sequence of calls. We are interested in a solution such that the total length C = |S · R^N| of the repeating part and N are both maximal, and |S| is minimal.
// Scala Pseudocode (beware of for comprehensions)
//
// Stack is assumed to be in reverse order,
// most recent stack frame is last.
val stack: Array[StackTraceElement]
val F: Int // Maximum size of R.
val candidates = for {
// Enumerate all possible suffixes S.
S <- ∀ prefix of stack
if |S| < F
// Remove the suffix from the stack,
R <- ∀ non-empty prefix of stack.drop(|S|)
// Find a fragment that ends with S.
if R.endsWith(S)
// Find out how many fragments fit into the stack.
// (how many times we can remove R from the stack)
N = coverSize(R, stack.drop(|S|))
if N >= 2 // Or higher.
} yield (S, R, N)
// Best cover has maximum coverage,
// minimum fragment length,
// and minimum suffix length.
val bestCandidate = candidates.maxBy { (S, R, N) =>
val C = |S| + |R| * N
return (C, -|R|, -|S|)
}
The entire algorithm can be implemented in a way that does not allocate any memory (to handle OOM). It has complexity O(F^2 |T|), but exceptions are rare enough and this is a small constant (F << 1024, T = 1024).
I have implemented this exact algorithm in my library https://github.com/alexknvl/tracehash (https://github.com/alexknvl/tracehash/blob/master/src/main/java/tracehash/internal/SOCoverSolver.java) for the same purpose of simplifying scalac/dotc errors ;)
EDIT: Here is an implementation of the same algorithm in Python:
stack = list(reversed([3, 4, 2, 1, 2, 1, 2, 1, 2, 1, 2]))
F = 6
results =
for slen in range(0, F + 1):
suffix, stack1 = stack[:slen], stack[slen:]
for flen in range(1, F + 1):
fragment = stack1[:flen]
if fragment[flen - slen:] != suffix:
continue
stack2 = stack1[:]
n = 0
while stack2[:flen] == fragment:
stack2 = stack2[flen:]
n += 1
if n >= 2: # A heuristic, might want to set it a bit higher.
results.append((slen, flen, n))
def cost(t):
s, r, n = t
c = s + r * n
return (c, -r, -s)
S, R, N = max(results, key=cost)
print('%s · %s^%d · %s' % (stack[:S], stack[S:S+R], N, stack[S+R*N:]))
# Prints · [2, 1]^4 · [2, 4, 3]
EDIT2: Following some of the ideas from mukel's answer, here is a function https://gist.github.com/alexknvl/b041099eb5347d728e2dacd1e8caed8c that solves something along the lines of:
stack = a[1]^k[1] · a[2]^k[2] · ...
argmax (sum |a[i]| * k[i] where k[i] >= 2,
-sum |a[i]| where k[i] >= 2,
-sum |a[i]| where k[i] == 1)
It is greedy so it is not necessarily an optimal solution, but it seems to work reasonably well in simple cases, e.g. given
stack = list(reversed([
3, 4, 2,
1, 2, 1, 2, 1, 2, 1, 2,
5,
4, 5, 4, 5, 4, 5,
3, 3, 3, 3]))
it produces an answer
[([3], 4), ([5, 4], 3), ([5], 1), ([2, 1], 4), ([2, 4, 3], 1)]
New contributor
edited 10 hours ago
New contributor
answered 15 hours ago
alex.knvlalex.knvl
212
212
New contributor
New contributor
"The entire algorithm can be implemented in a way that does not allocate any memory." Do you mean that the algorithm can be implemented so that it only uses stack memory? Why is it useful to use stack memory only?
– Apass.Jack
12 hours ago
1
You might want to avoid allocating anything on the heap in case you catch an OutOfMemoryError.
– alex.knvl
12 hours ago
I see. Well, it happens the error in the question is StackOverflowError.
– Apass.Jack
32 mins ago
add a comment |
"The entire algorithm can be implemented in a way that does not allocate any memory." Do you mean that the algorithm can be implemented so that it only uses stack memory? Why is it useful to use stack memory only?
– Apass.Jack
12 hours ago
1
You might want to avoid allocating anything on the heap in case you catch an OutOfMemoryError.
– alex.knvl
12 hours ago
I see. Well, it happens the error in the question is StackOverflowError.
– Apass.Jack
32 mins ago
"The entire algorithm can be implemented in a way that does not allocate any memory." Do you mean that the algorithm can be implemented so that it only uses stack memory? Why is it useful to use stack memory only?
– Apass.Jack
12 hours ago
"The entire algorithm can be implemented in a way that does not allocate any memory." Do you mean that the algorithm can be implemented so that it only uses stack memory? Why is it useful to use stack memory only?
– Apass.Jack
12 hours ago
1
1
You might want to avoid allocating anything on the heap in case you catch an OutOfMemoryError.
– alex.knvl
12 hours ago
You might want to avoid allocating anything on the heap in case you catch an OutOfMemoryError.
– alex.knvl
12 hours ago
I see. Well, it happens the error in the question is StackOverflowError.
– Apass.Jack
32 mins ago
I see. Well, it happens the error in the question is StackOverflowError.
– Apass.Jack
32 mins ago
add a comment |
string contiguous_repeated_string(string A){
string r(1,A[0]);
for(int i=0;i<A.size();i++){
for(int l=2;l<=A.size();l++){
string s=A.substr(i,l);
if ((s+s).substr(1,2*s.size()-1).find(s)!=s.size()-1 and s.size()>r.size()){
r=s;
}
}
}
return r;
}
Call contiguous_repeated_string("abcdefgabcdabcdabcd")
you will get abcdabcdabcd
.
Call contiguous_repeated_string("ATCGATCGA")
you will get ATCGATCG
.
And then you can use KMP's Longest Proper Prefix Postfix array
to fold the resulting string with O(N)
.
Total time complexity is O(N^2)
.
New contributor
add a comment |
string contiguous_repeated_string(string A){
string r(1,A[0]);
for(int i=0;i<A.size();i++){
for(int l=2;l<=A.size();l++){
string s=A.substr(i,l);
if ((s+s).substr(1,2*s.size()-1).find(s)!=s.size()-1 and s.size()>r.size()){
r=s;
}
}
}
return r;
}
Call contiguous_repeated_string("abcdefgabcdabcdabcd")
you will get abcdabcdabcd
.
Call contiguous_repeated_string("ATCGATCGA")
you will get ATCGATCG
.
And then you can use KMP's Longest Proper Prefix Postfix array
to fold the resulting string with O(N)
.
Total time complexity is O(N^2)
.
New contributor
add a comment |
string contiguous_repeated_string(string A){
string r(1,A[0]);
for(int i=0;i<A.size();i++){
for(int l=2;l<=A.size();l++){
string s=A.substr(i,l);
if ((s+s).substr(1,2*s.size()-1).find(s)!=s.size()-1 and s.size()>r.size()){
r=s;
}
}
}
return r;
}
Call contiguous_repeated_string("abcdefgabcdabcdabcd")
you will get abcdabcdabcd
.
Call contiguous_repeated_string("ATCGATCGA")
you will get ATCGATCG
.
And then you can use KMP's Longest Proper Prefix Postfix array
to fold the resulting string with O(N)
.
Total time complexity is O(N^2)
.
New contributor
string contiguous_repeated_string(string A){
string r(1,A[0]);
for(int i=0;i<A.size();i++){
for(int l=2;l<=A.size();l++){
string s=A.substr(i,l);
if ((s+s).substr(1,2*s.size()-1).find(s)!=s.size()-1 and s.size()>r.size()){
r=s;
}
}
}
return r;
}
Call contiguous_repeated_string("abcdefgabcdabcdabcd")
you will get abcdabcdabcd
.
Call contiguous_repeated_string("ATCGATCGA")
you will get ATCGATCG
.
And then you can use KMP's Longest Proper Prefix Postfix array
to fold the resulting string with O(N)
.
Total time complexity is O(N^2)
.
New contributor
New contributor
answered 46 mins ago
Wu FuhengWu Fuheng
11
11
New contributor
New contributor
add a comment |
add a comment |
Li Haoyi is a new contributor. Be nice, and check out our Code of Conduct.
Li Haoyi is a new contributor. Be nice, and check out our Code of Conduct.
Li Haoyi is a new contributor. Be nice, and check out our Code of Conduct.
Li Haoyi is a new contributor. Be nice, and check out our Code of Conduct.
Thanks for contributing an answer to Computer Science Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Some of your past answers have not been well-received, and you're in danger of being blocked from answering.
Please pay close attention to the following guidance:
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcs.stackexchange.com%2fquestions%2f102463%2flooking-for-an-algorithm-to-identify-contiguous-repeated-series-of-lines-in-a-lo%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown